sys/net/pf/pf.c

   1 /*
   2  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
   3  *
   4  * Copyright (c) 2001 Daniel Hartmeier
   5  * Copyright (c) 2002 - 2008 Henning Brauer
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  *
  12  *    - Redistributions of source code must retain the above copyright
  13  *      notice, this list of conditions and the following disclaimer.
  14  *    - Redistributions in binary form must reproduce the above
  15  *      copyright notice, this list of conditions and the following
  16  *      disclaimer in the documentation and/or other materials provided
  17  *      with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  23  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  25  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  27  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  29  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30  * POSSIBILITY OF SUCH DAMAGE.
  31  *
  32  * Effort sponsored in part by the Defense Advanced Research Projects
  33  * Agency (DARPA) and Air Force Research Laboratory, Air Force
  34  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
  35  *
  36  */
  37
  38 #include "opt_inet.h"
  39 #include "opt_inet6.h"
  40
  41 #include <sys/param.h>
  42 #include <sys/systm.h>
  43 #include <sys/malloc.h>
  44 #include <sys/mbuf.h>
  45 #include <sys/filio.h>
  46 #include <sys/socket.h>
  47 #include <sys/socketvar.h>
  48 #include <sys/kernel.h>
  49 #include <sys/time.h>
  50 #include <sys/sysctl.h>
  51 #include <sys/endian.h>
  52 #include <sys/proc.h>
  53 #include <sys/kthread.h>
  54 #include <sys/spinlock.h>
  55
  56 #include <machine/inttypes.h>
  57
  58 #include <sys/md5.h>
  59
  60 #include <net/if.h>
  61 #include <net/if_types.h>
  62 #include <net/bpf.h>
  63 #include <net/netisr2.h>
  64 #include <net/route.h>
  65
  66 #include <netinet/in.h>
  67 #include <netinet/in_var.h>
  68 #include <netinet/in_systm.h>
  69 #include <netinet/ip.h>
  70 #include <netinet/ip_var.h>
  71 #include <netinet/tcp.h>
  72 #include <netinet/tcp_seq.h>
  73 #include <netinet/udp.h>
  74 #include <netinet/ip_icmp.h>
  75 #include <netinet/in_pcb.h>
  76 #include <netinet/tcp_timer.h>
  77 #include <netinet/tcp_var.h>
  78 #include <netinet/udp_var.h>
  79 #include <netinet/icmp_var.h>
  80 #include <netinet/if_ether.h>
  81
  82 #include <net/pf/pfvar.h>
  83 #include <net/pf/if_pflog.h>
  84
  85 #include <net/pf/if_pfsync.h>
  86
  87 #ifdef INET6
  88 #include <netinet/ip6.h>
  89 #include <netinet/icmp6.h>
  90 #include <netinet6/nd6.h>
  91 #include <netinet6/ip6_var.h>
  92 #include <netinet6/in6_pcb.h>
  93 #endif /* INET6 */
  94
  95 #include <sys/in_cksum.h>
  96 #include <sys/ucred.h>
  97 #include <machine/limits.h>
  98 #include <sys/msgport2.h>
  99 #include <sys/spinlock2.h>
 100 #include <net/netmsg2.h>
 101 #include <net/toeplitz2.h>
 102
 103 extern int ip_optcopy(struct ip *, struct ip *);
 104 extern int debug_pfugidhack;
 105
 106 /*
 107  * pf_token - shared lock for cpu-localized operations,
 108  *            exclusive lock otherwise.
 109  *
 110  * pf_gtoken- exclusive lock used for initialization.
 111  *
 112  * pf_spin  - only used to atomically fetch and increment stateid
 113  *            on 32-bit systems.
 114  */
 115 struct lwkt_token pf_token = LWKT_TOKEN_INITIALIZER(pf_token);
 116 struct lwkt_token pf_gtoken = LWKT_TOKEN_INITIALIZER(pf_gtoken);
 117 #if __SIZEOF_LONG__ != 8
 118 struct spinlock pf_spin = SPINLOCK_INITIALIZER(pf_spin, "pf_spin");
 119 #endif
 120
 121 #define DPFPRINTF(n, x) if (pf_status.debug >= (n)) kprintf x
 122
 123 #define FAIL(code)      { error = (code); goto done; }
 124
 125 /*
 126  * Global variables
 127  */
 128
 129 /* mask radix tree */
 130 struct radix_node_head  *pf_maskhead;
 131
 132 /* state tables */
 133 struct pf_state_tree     pf_statetbl[MAXCPU+1]; /* incls one global table */
 134
 135 struct pf_altqqueue      pf_altqs[2];
 136 struct pf_palist         pf_pabuf;
 137 struct pf_altqqueue     *pf_altqs_active;
 138 struct pf_altqqueue     *pf_altqs_inactive;
 139 struct pf_status         pf_status;
 140
 141 u_int32_t                ticket_altqs_active;
 142 u_int32_t                ticket_altqs_inactive;
 143 int                      altqs_inactive_open;
 144 u_int32_t                ticket_pabuf;
 145
 146 MD5_CTX                  pf_tcp_secret_ctx;
 147 u_char                   pf_tcp_secret[16];
 148 int                      pf_tcp_secret_init;
 149 int                      pf_tcp_iss_off;
 150
 151 struct pf_anchor_stackframe {
 152         struct pf_ruleset                       *rs;
 153         struct pf_rule                          *r;
 154         struct pf_anchor_node                   *parent;
 155         struct pf_anchor                        *child;
 156 } pf_anchor_stack[64];
 157
 158 struct malloc_type       *pf_src_tree_pl, *pf_rule_pl, *pf_pooladdr_pl;
 159 struct malloc_type       *pf_state_pl, *pf_state_key_pl, *pf_state_item_pl;
 160 struct malloc_type       *pf_altq_pl;
 161
 162 void                     pf_print_host(struct pf_addr *, u_int16_t, u_int8_t);
 163
 164 void                     pf_init_threshold(struct pf_threshold *, u_int32_t,
 165                             u_int32_t);
 166 void                     pf_add_threshold(struct pf_threshold *);
 167 int                      pf_check_threshold(struct pf_threshold *);
 168
 169 void                     pf_change_ap(struct pf_addr *, u_int16_t *,
 170                             u_int16_t *, u_int16_t *, struct pf_addr *,
 171                             u_int16_t, u_int8_t, sa_family_t);
 172 int                      pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
 173                             struct tcphdr *, struct pf_state_peer *);
 174 #ifdef INET6
 175 void                     pf_change_a6(struct pf_addr *, u_int16_t *,
 176                             struct pf_addr *, u_int8_t);
 177 #endif /* INET6 */
 178 void                     pf_change_icmp(struct pf_addr *, u_int16_t *,
 179                             struct pf_addr *, struct pf_addr *, u_int16_t,
 180                             u_int16_t *, u_int16_t *, u_int16_t *,
 181                             u_int16_t *, u_int8_t, sa_family_t);
 182 void                     pf_send_tcp(const struct pf_rule *, sa_family_t,
 183                             const struct pf_addr *, const struct pf_addr *,
 184                             u_int16_t, u_int16_t, u_int32_t, u_int32_t,
 185                             u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
 186                             u_int16_t, struct ether_header *, struct ifnet *);
 187 void                     pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
 188                             sa_family_t, struct pf_rule *);
 189 struct pf_rule          *pf_match_translation(struct pf_pdesc *, struct mbuf *,
 190                             int, int, struct pfi_kif *,
 191                             struct pf_addr *, u_int16_t, struct pf_addr *,
 192                             u_int16_t, int);
 193 struct pf_rule          *pf_get_translation(struct pf_pdesc *, struct mbuf *,
 194                             int, int, struct pfi_kif *, struct pf_src_node **,
 195                             struct pf_state_key **, struct pf_state_key **,
 196                             struct pf_state_key **, struct pf_state_key **,
 197                             struct pf_addr *, struct pf_addr *,
 198                             u_int16_t, u_int16_t);
 199 void                     pf_detach_state(struct pf_state *);
 200 int                      pf_state_key_setup(struct pf_pdesc *, struct pf_rule *,
 201                             struct pf_state_key **, struct pf_state_key **,
 202                             struct pf_state_key **, struct pf_state_key **,
 203                             struct pf_addr *, struct pf_addr *,
 204                             u_int16_t, u_int16_t);
 205 void                     pf_state_key_detach(struct pf_state *, int);
 206 u_int32_t                pf_tcp_iss(struct pf_pdesc *);
 207 int                      pf_test_rule(struct pf_rule **, struct pf_state **,
 208                             int, struct pfi_kif *, struct mbuf *, int,
 209                             void *, struct pf_pdesc *, struct pf_rule **,
 210                             struct pf_ruleset **, struct ifqueue *, struct inpcb *);
 211 static __inline int      pf_create_state(struct pf_rule *, struct pf_rule *,
 212                             struct pf_rule *, struct pf_pdesc *,
 213                             struct pf_src_node *, struct pf_state_key *,
 214                             struct pf_state_key *, struct pf_state_key *,
 215                             struct pf_state_key *, struct mbuf *, int,
 216                             u_int16_t, u_int16_t, int *, struct pfi_kif *,
 217                             struct pf_state **, int, u_int16_t, u_int16_t,
 218                             int);
 219 int                      pf_test_fragment(struct pf_rule **, int,
 220                             struct pfi_kif *, struct mbuf *, void *,
 221                             struct pf_pdesc *, struct pf_rule **,
 222                             struct pf_ruleset **);
 223 int                      pf_tcp_track_full(struct pf_state_peer *,
 224                             struct pf_state_peer *, struct pf_state **,
 225                             struct pfi_kif *, struct mbuf *, int,
 226                             struct pf_pdesc *, u_short *, int *);
 227 int                     pf_tcp_track_sloppy(struct pf_state_peer *,
 228                             struct pf_state_peer *, struct pf_state **,
 229                             struct pf_pdesc *, u_short *);
 230 int                      pf_test_state_tcp(struct pf_state **, int,
 231                             struct pfi_kif *, struct mbuf *, int,
 232                             void *, struct pf_pdesc *, u_short *);
 233 int                      pf_test_state_udp(struct pf_state **, int,
 234                             struct pfi_kif *, struct mbuf *, int,
 235                             void *, struct pf_pdesc *);
 236 int                      pf_test_state_icmp(struct pf_state **, int,
 237                             struct pfi_kif *, struct mbuf *, int,
 238                             void *, struct pf_pdesc *, u_short *);
 239 int                      pf_test_state_other(struct pf_state **, int,
 240                             struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
 241 void                     pf_step_into_anchor(int *, struct pf_ruleset **, int,
 242                             struct pf_rule **, struct pf_rule **, int *);
 243 int                      pf_step_out_of_anchor(int *, struct pf_ruleset **,
 244                              int, struct pf_rule **, struct pf_rule **,
 245                              int *);
 246 void                     pf_hash(struct pf_addr *, struct pf_addr *,
 247                             struct pf_poolhashkey *, sa_family_t);
 248 int                      pf_map_addr(u_int8_t, struct pf_rule *,
 249                             struct pf_addr *, struct pf_addr *,
 250                             struct pf_addr *, struct pf_src_node **);
 251 int                      pf_get_sport(struct pf_pdesc *,
 252                             sa_family_t, u_int8_t, struct pf_rule *,
 253                             struct pf_addr *, struct pf_addr *,
 254                             u_int16_t, u_int16_t,
 255                             struct pf_addr *, u_int16_t *,
 256                             u_int16_t, u_int16_t,
 257                             struct pf_src_node **);
 258 void                     pf_route(struct mbuf **, struct pf_rule *, int,
 259                             struct ifnet *, struct pf_state *,
 260                             struct pf_pdesc *);
 261 void                     pf_route6(struct mbuf **, struct pf_rule *, int,
 262                             struct ifnet *, struct pf_state *,
 263                             struct pf_pdesc *);
 264 u_int8_t                 pf_get_wscale(struct mbuf *, int, u_int16_t,
 265                             sa_family_t);
 266 u_int16_t                pf_get_mss(struct mbuf *, int, u_int16_t,
 267                             sa_family_t);
 268 u_int16_t                pf_calc_mss(struct pf_addr *, sa_family_t,
 269                                 u_int16_t);
 270 void                     pf_set_rt_ifp(struct pf_state *,
 271                             struct pf_addr *);
 272 int                      pf_check_proto_cksum(struct mbuf *, int, int,
 273                             u_int8_t, sa_family_t);
 274 struct pf_divert        *pf_get_divert(struct mbuf *);
 275 void                     pf_print_state_parts(struct pf_state *,
 276                             struct pf_state_key *, struct pf_state_key *);
 277 int                      pf_addr_wrap_neq(struct pf_addr_wrap *,
 278                             struct pf_addr_wrap *);
 279 struct pf_state         *pf_find_state(struct pfi_kif *,
 280                             struct pf_state_key_cmp *, u_int, struct mbuf *);
 281 int                      pf_src_connlimit(struct pf_state *);
 282 int                      pf_check_congestion(struct ifqueue *);
 283
 284 extern int pf_end_threads;
 285
 286 struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = {
 287         { &pf_state_pl, PFSTATE_HIWAT },
 288         { &pf_src_tree_pl, PFSNODE_HIWAT },
 289         { &pf_frent_pl, PFFRAG_FRENT_HIWAT },
 290         { &pfr_ktable_pl, PFR_KTABLE_HIWAT },
 291         { &pfr_kentry_pl, PFR_KENTRY_HIWAT }
 292 };
 293
 294 /*
 295  * If route-to and direction is out we match with no further processing
 296  *      (rt_kif must be assigned and not equal to the out interface)
 297  * If reply-to and direction is in we match with no further processing
 298  *      (rt_kif must be assigned and not equal to the in interface)
 299  */
 300 #define STATE_LOOKUP(i, k, d, s, m)                                     \
 301         do {                                                            \
 302                 s = pf_find_state(i, k, d, m);                          \
 303                 if (s == NULL || (s)->timeout == PFTM_PURGE)            \
 304                         return (PF_DROP);                               \
 305                 if (d == PF_OUT &&                                      \
 306                     (((s)->rule.ptr->rt == PF_ROUTETO &&                \
 307                     (s)->rule.ptr->direction == PF_OUT) ||              \
 308                     ((s)->rule.ptr->rt == PF_REPLYTO &&                 \
 309                     (s)->rule.ptr->direction == PF_IN)) &&              \
 310                     (s)->rt_kif != NULL &&                              \
 311                     (s)->rt_kif != i)                                   \
 312                         return (PF_PASS);                               \
 313         } while (0)
 314
 315 #define BOUND_IFACE(r, k) \
 316         ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all
 317
 318 #define STATE_INC_COUNTERS(s)                           \
 319         do {                                            \
 320                 atomic_add_int(&s->rule.ptr->states_cur, 1);    \
 321                 s->rule.ptr->states_tot++;              \
 322                 if (s->anchor.ptr != NULL) {            \
 323                         atomic_add_int(&s->anchor.ptr->states_cur, 1);  \
 324                         s->anchor.ptr->states_tot++;    \
 325                 }                                       \
 326                 if (s->nat_rule.ptr != NULL) {          \
 327                         atomic_add_int(&s->nat_rule.ptr->states_cur, 1); \
 328                         s->nat_rule.ptr->states_tot++;  \
 329                 }                                       \
 330         } while (0)
 331
 332 #define STATE_DEC_COUNTERS(s)                           \
 333         do {                                            \
 334                 if (s->nat_rule.ptr != NULL)            \
 335                         atomic_add_int(&s->nat_rule.ptr->states_cur, -1); \
 336                 if (s->anchor.ptr != NULL)              \
 337                         atomic_add_int(&s->anchor.ptr->states_cur, -1); \
 338                 atomic_add_int(&s->rule.ptr->states_cur, -1);           \
 339         } while (0)
 340
 341 static MALLOC_DEFINE(M_PFSTATEPL, "pfstatepl", "pf state pool list");
 342 static MALLOC_DEFINE(M_PFSRCTREEPL, "pfsrctpl", "pf source tree pool list");
 343 static MALLOC_DEFINE(M_PFSTATEKEYPL, "pfstatekeypl", "pf state key pool list");
 344 static MALLOC_DEFINE(M_PFSTATEITEMPL, "pfstateitempl", "pf state item pool list");
 345
 346 static __inline int pf_src_compare(struct pf_src_node *, struct pf_src_node *);
 347 static __inline int pf_state_compare_key(struct pf_state_key *,
 348                                 struct pf_state_key *);
 349 static __inline int pf_state_compare_rkey(struct pf_state_key *,
 350                                 struct pf_state_key *);
 351 static __inline int pf_state_compare_id(struct pf_state *,
 352                                 struct pf_state *);
 353
 354 struct pf_src_tree tree_src_tracking[MAXCPU];
 355 struct pf_state_tree_id tree_id[MAXCPU];
 356 struct pf_state_queue state_list[MAXCPU];
 357
 358 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
 359 RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
 360 RB_GENERATE(pf_state_rtree, pf_state_key, entry, pf_state_compare_rkey);
 361 RB_GENERATE(pf_state_tree_id, pf_state, entry_id, pf_state_compare_id);
 362
 363 static __inline int
 364 pf_src_compare(struct pf_src_node *a, struct pf_src_node *b)
 365 {
 366         int     diff;
 367
 368         if (a->rule.ptr > b->rule.ptr)
 369                 return (1);
 370         if (a->rule.ptr < b->rule.ptr)
 371                 return (-1);
 372         if ((diff = a->af - b->af) != 0)
 373                 return (diff);
 374         switch (a->af) {
 375 #ifdef INET
 376         case AF_INET:
 377                 if (a->addr.addr32[0] > b->addr.addr32[0])
 378                         return (1);
 379                 if (a->addr.addr32[0] < b->addr.addr32[0])
 380                         return (-1);
 381                 break;
 382 #endif /* INET */
 383 #ifdef INET6
 384         case AF_INET6:
 385                 if (a->addr.addr32[3] > b->addr.addr32[3])
 386                         return (1);
 387                 if (a->addr.addr32[3] < b->addr.addr32[3])
 388                         return (-1);
 389                 if (a->addr.addr32[2] > b->addr.addr32[2])
 390                         return (1);
 391                 if (a->addr.addr32[2] < b->addr.addr32[2])
 392                         return (-1);
 393                 if (a->addr.addr32[1] > b->addr.addr32[1])
 394                         return (1);
 395                 if (a->addr.addr32[1] < b->addr.addr32[1])
 396                         return (-1);
 397                 if (a->addr.addr32[0] > b->addr.addr32[0])
 398                         return (1);
 399                 if (a->addr.addr32[0] < b->addr.addr32[0])
 400                         return (-1);
 401                 break;
 402 #endif /* INET6 */
 403         }
 404         return (0);
 405 }
 406
 407 u_int32_t
 408 pf_state_hash(struct pf_state_key *sk)
 409 {
 410         u_int32_t hv = (u_int32_t)(((intptr_t)sk >> 6) ^ ((intptr_t)sk >> 15));
 411         if (hv == 0)    /* disallow 0 */
 412                 hv = 1;
 413         return(hv);
 414 }
 415
 416 #ifdef INET6
 417 void
 418 pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
 419 {
 420         switch (af) {
 421 #ifdef INET
 422         case AF_INET:
 423                 dst->addr32[0] = src->addr32[0];
 424                 break;
 425 #endif /* INET */
 426         case AF_INET6:
 427                 dst->addr32[0] = src->addr32[0];
 428                 dst->addr32[1] = src->addr32[1];
 429                 dst->addr32[2] = src->addr32[2];
 430                 dst->addr32[3] = src->addr32[3];
 431                 break;
 432         }
 433 }
 434 #endif /* INET6 */
 435
 436 void
 437 pf_init_threshold(struct pf_threshold *threshold,
 438     u_int32_t limit, u_int32_t seconds)
 439 {
 440         threshold->limit = limit * PF_THRESHOLD_MULT;
 441         threshold->seconds = seconds;
 442         threshold->count = 0;
 443         threshold->last = time_second;
 444 }
 445
 446 void
 447 pf_add_threshold(struct pf_threshold *threshold)
 448 {
 449         u_int32_t t = time_second, diff = t - threshold->last;
 450
 451         if (diff >= threshold->seconds)
 452                 threshold->count = 0;
 453         else
 454                 threshold->count -= threshold->count * diff /
 455                     threshold->seconds;
 456         threshold->count += PF_THRESHOLD_MULT;
 457         threshold->last = t;
 458 }
 459
 460 int
 461 pf_check_threshold(struct pf_threshold *threshold)
 462 {
 463         return (threshold->count > threshold->limit);
 464 }
 465
 466 int
 467 pf_src_connlimit(struct pf_state *state)
 468 {
 469         int bad = 0;
 470         int cpu = mycpu->gd_cpuid;
 471
 472         state->src_node->conn++;
 473         state->src.tcp_est = 1;
 474         pf_add_threshold(&state->src_node->conn_rate);
 475
 476         if (state->rule.ptr->max_src_conn &&
 477             state->rule.ptr->max_src_conn <
 478             state->src_node->conn) {
 479                 pf_status.lcounters[LCNT_SRCCONN]++;
 480                 bad++;
 481         }
 482
 483         if (state->rule.ptr->max_src_conn_rate.limit &&
 484             pf_check_threshold(&state->src_node->conn_rate)) {
 485                 pf_status.lcounters[LCNT_SRCCONNRATE]++;
 486                 bad++;
 487         }
 488
 489         if (!bad)
 490                 return 0;
 491
 492         if (state->rule.ptr->overload_tbl) {
 493                 struct pfr_addr p;
 494                 u_int32_t       killed = 0;
 495
 496                 pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
 497                 if (pf_status.debug >= PF_DEBUG_MISC) {
 498                         kprintf("pf_src_connlimit: blocking address ");
 499                         pf_print_host(&state->src_node->addr, 0,
 500                             state->key[PF_SK_WIRE]->af);
 501                 }
 502
 503                 bzero(&p, sizeof(p));
 504                 p.pfra_af = state->key[PF_SK_WIRE]->af;
 505                 switch (state->key[PF_SK_WIRE]->af) {
 506 #ifdef INET
 507                 case AF_INET:
 508                         p.pfra_net = 32;
 509                         p.pfra_ip4addr = state->src_node->addr.v4;
 510                         break;
 511 #endif /* INET */
 512 #ifdef INET6
 513                 case AF_INET6:
 514                         p.pfra_net = 128;
 515                         p.pfra_ip6addr = state->src_node->addr.v6;
 516                         break;
 517 #endif /* INET6 */
 518                 }
 519
 520                 pfr_insert_kentry(state->rule.ptr->overload_tbl,
 521                     &p, time_second);
 522
 523                 /* kill existing states if that's required. */
 524                 if (state->rule.ptr->flush) {
 525                         struct pf_state_key *sk;
 526                         struct pf_state *st;
 527
 528                         pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;
 529                         RB_FOREACH(st, pf_state_tree_id, &tree_id[cpu]) {
 530                                 sk = st->key[PF_SK_WIRE];
 531                                 /*
 532                                  * Kill states from this source.  (Only those
 533                                  * from the same rule if PF_FLUSH_GLOBAL is not
 534                                  * set).  (Only on current cpu).
 535                                  */
 536                                 if (sk->af ==
 537                                     state->key[PF_SK_WIRE]->af &&
 538                                     ((state->direction == PF_OUT &&
 539                                     PF_AEQ(&state->src_node->addr,
 540                                         &sk->addr[0], sk->af)) ||
 541                                     (state->direction == PF_IN &&
 542                                     PF_AEQ(&state->src_node->addr,
 543                                         &sk->addr[1], sk->af))) &&
 544                                     (state->rule.ptr->flush &
 545                                     PF_FLUSH_GLOBAL ||
 546                                     state->rule.ptr == st->rule.ptr)) {
 547                                         st->timeout = PFTM_PURGE;
 548                                         st->src.state = st->dst.state =
 549                                             TCPS_CLOSED;
 550                                         killed++;
 551                                 }
 552                         }
 553                         if (pf_status.debug >= PF_DEBUG_MISC)
 554                                 kprintf(", %u states killed", killed);
 555                 }
 556                 if (pf_status.debug >= PF_DEBUG_MISC)
 557                         kprintf("\n");
 558         }
 559
 560         /* kill this state */
 561         state->timeout = PFTM_PURGE;
 562         state->src.state = state->dst.state = TCPS_CLOSED;
 563
 564         return 1;
 565 }
 566
 567 int
 568 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
 569     struct pf_addr *src, sa_family_t af)
 570 {
 571         struct pf_src_node      k;
 572         int cpu = mycpu->gd_cpuid;
 573
 574         bzero(&k, sizeof(k));   /* avoid gcc warnings */
 575         if (*sn == NULL) {
 576                 k.af = af;
 577                 PF_ACPY(&k.addr, src, af);
 578                 if (rule->rule_flag & PFRULE_RULESRCTRACK ||
 579                     rule->rpool.opts & PF_POOL_STICKYADDR)
 580                         k.rule.ptr = rule;
 581                 else
 582                         k.rule.ptr = NULL;
 583                 pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
 584                 *sn = RB_FIND(pf_src_tree, &tree_src_tracking[cpu], &k);
 585         }
 586         if (*sn == NULL) {
 587                 if (!rule->max_src_nodes ||
 588                     rule->src_nodes < rule->max_src_nodes)
 589                         (*sn) = kmalloc(sizeof(struct pf_src_node),
 590                                         M_PFSRCTREEPL, M_NOWAIT|M_ZERO);
 591                 else
 592                         pf_status.lcounters[LCNT_SRCNODES]++;
 593                 if ((*sn) == NULL)
 594                         return (-1);
 595
 596                 pf_init_threshold(&(*sn)->conn_rate,
 597                     rule->max_src_conn_rate.limit,
 598                     rule->max_src_conn_rate.seconds);
 599
 600                 (*sn)->af = af;
 601                 if (rule->rule_flag & PFRULE_RULESRCTRACK ||
 602                     rule->rpool.opts & PF_POOL_STICKYADDR)
 603                         (*sn)->rule.ptr = rule;
 604                 else
 605                         (*sn)->rule.ptr = NULL;
 606                 PF_ACPY(&(*sn)->addr, src, af);
 607                 if (RB_INSERT(pf_src_tree,
 608                     &tree_src_tracking[cpu], *sn) != NULL) {
 609                         if (pf_status.debug >= PF_DEBUG_MISC) {
 610                                 kprintf("pf: src_tree insert failed: ");
 611                                 pf_print_host(&(*sn)->addr, 0, af);
 612                                 kprintf("\n");
 613                         }
 614                         kfree(*sn, M_PFSRCTREEPL);
 615                         return (-1);
 616                 }
 617
 618                 /*
 619                  * Atomic op required to increment src_nodes in the rule
 620                  * because we hold a shared token here (decrements will use
 621                  * an exclusive token).
 622                  */
 623                 (*sn)->creation = time_second;
 624                 (*sn)->ruletype = rule->action;
 625                 if ((*sn)->rule.ptr != NULL)
 626                         atomic_add_int(&(*sn)->rule.ptr->src_nodes, 1);
 627                 pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
 628                 atomic_add_int(&pf_status.src_nodes, 1);
 629         } else {
 630                 if (rule->max_src_states &&
 631                     (*sn)->states >= rule->max_src_states) {
 632                         pf_status.lcounters[LCNT_SRCSTATES]++;
 633                         return (-1);
 634                 }
 635         }
 636         return (0);
 637 }
 638
 639 /*
 640  * state table (indexed by the pf_state_key structure), normal RBTREE
 641  * comparison.
 642  */
 643 static __inline int
 644 pf_state_compare_key(struct pf_state_key *a, struct pf_state_key *b)
 645 {
 646         int     diff;
 647
 648         if ((diff = a->proto - b->proto) != 0)
 649                 return (diff);
 650         if ((diff = a->af - b->af) != 0)
 651                 return (diff);
 652         switch (a->af) {
 653 #ifdef INET
 654         case AF_INET:
 655                 if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
 656                         return (1);
 657                 if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
 658                         return (-1);
 659                 if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
 660                         return (1);
 661                 if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
 662                         return (-1);
 663                 break;
 664 #endif /* INET */
 665 #ifdef INET6
 666         case AF_INET6:
 667                 if (a->addr[0].addr32[3] > b->addr[0].addr32[3])
 668                         return (1);
 669                 if (a->addr[0].addr32[3] < b->addr[0].addr32[3])
 670                         return (-1);
 671                 if (a->addr[1].addr32[3] > b->addr[1].addr32[3])
 672                         return (1);
 673                 if (a->addr[1].addr32[3] < b->addr[1].addr32[3])
 674                         return (-1);
 675                 if (a->addr[0].addr32[2] > b->addr[0].addr32[2])
 676                         return (1);
 677                 if (a->addr[0].addr32[2] < b->addr[0].addr32[2])
 678                         return (-1);
 679                 if (a->addr[1].addr32[2] > b->addr[1].addr32[2])
 680                         return (1);
 681                 if (a->addr[1].addr32[2] < b->addr[1].addr32[2])
 682                         return (-1);
 683                 if (a->addr[0].addr32[1] > b->addr[0].addr32[1])
 684                         return (1);
 685                 if (a->addr[0].addr32[1] < b->addr[0].addr32[1])
 686                         return (-1);
 687                 if (a->addr[1].addr32[1] > b->addr[1].addr32[1])
 688                         return (1);
 689                 if (a->addr[1].addr32[1] < b->addr[1].addr32[1])
 690                         return (-1);
 691                 if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
 692                         return (1);
 693                 if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
 694                         return (-1);
 695                 if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
 696                         return (1);
 697                 if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
 698                         return (-1);
 699                 break;
 700 #endif /* INET6 */
 701         }
 702
 703         if ((diff = a->port[0] - b->port[0]) != 0)
 704                 return (diff);
 705         if ((diff = a->port[1] - b->port[1]) != 0)
 706                 return (diff);
 707
 708         return (0);
 709 }
 710
 711 /*
 712  * Used for RB_FIND only, compare in the reverse direction.  The
 713  * element to be reversed is always (a), since we obviously can't
 714  * reverse the state tree depicted by (b).
 715  */
 716 static __inline int
 717 pf_state_compare_rkey(struct pf_state_key *a, struct pf_state_key *b)
 718 {
 719         int     diff;
 720
 721         if ((diff = a->proto - b->proto) != 0)
 722                 return (diff);
 723         if ((diff = a->af - b->af) != 0)
 724                 return (diff);
 725         switch (a->af) {
 726 #ifdef INET
 727         case AF_INET:
 728                 if (a->addr[1].addr32[0] > b->addr[0].addr32[0])
 729                         return (1);
 730                 if (a->addr[1].addr32[0] < b->addr[0].addr32[0])
 731                         return (-1);
 732                 if (a->addr[0].addr32[0] > b->addr[1].addr32[0])
 733                         return (1);
 734                 if (a->addr[0].addr32[0] < b->addr[1].addr32[0])
 735                         return (-1);
 736                 break;
 737 #endif /* INET */
 738 #ifdef INET6
 739         case AF_INET6:
 740                 if (a->addr[1].addr32[3] > b->addr[0].addr32[3])
 741                         return (1);
 742                 if (a->addr[1].addr32[3] < b->addr[0].addr32[3])
 743                         return (-1);
 744                 if (a->addr[0].addr32[3] > b->addr[1].addr32[3])
 745                         return (1);
 746                 if (a->addr[0].addr32[3] < b->addr[1].addr32[3])
 747                         return (-1);
 748                 if (a->addr[1].addr32[2] > b->addr[0].addr32[2])
 749                         return (1);
 750                 if (a->addr[1].addr32[2] < b->addr[0].addr32[2])
 751                         return (-1);
 752                 if (a->addr[0].addr32[2] > b->addr[1].addr32[2])
 753                         return (1);
 754                 if (a->addr[0].addr32[2] < b->addr[1].addr32[2])
 755                         return (-1);
 756                 if (a->addr[1].addr32[1] > b->addr[0].addr32[1])
 757                         return (1);
 758                 if (a->addr[1].addr32[1] < b->addr[0].addr32[1])
 759                         return (-1);
 760                 if (a->addr[0].addr32[1] > b->addr[1].addr32[1])
 761                         return (1);
 762                 if (a->addr[0].addr32[1] < b->addr[1].addr32[1])
 763                         return (-1);
 764                 if (a->addr[1].addr32[0] > b->addr[0].addr32[0])
 765                         return (1);
 766                 if (a->addr[1].addr32[0] < b->addr[0].addr32[0])
 767                         return (-1);
 768                 if (a->addr[0].addr32[0] > b->addr[1].addr32[0])
 769                         return (1);
 770                 if (a->addr[0].addr32[0] < b->addr[1].addr32[0])
 771                         return (-1);
 772                 break;
 773 #endif /* INET6 */
 774         }
 775
 776         if ((diff = a->port[1] - b->port[0]) != 0)
 777                 return (diff);
 778         if ((diff = a->port[0] - b->port[1]) != 0)
 779                 return (diff);
 780
 781         return (0);
 782 }
 783
 784 static __inline int
 785 pf_state_compare_id(struct pf_state *a, struct pf_state *b)
 786 {
 787         if (a->id > b->id)
 788                 return (1);
 789         if (a->id < b->id)
 790                 return (-1);
 791         if (a->creatorid > b->creatorid)
 792                 return (1);
 793         if (a->creatorid < b->creatorid)
 794                 return (-1);
 795
 796         return (0);
 797 }
 798
 799 int
 800 pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx)
 801 {
 802         struct pf_state_item    *si;
 803         struct pf_state_key     *cur;
 804         int cpu;
 805         int error;
 806
 807         /*
 808          * PFSTATE_STACK_GLOBAL is set when the state might not hash to the
 809          * current cpu.  The keys are managed on the global statetbl tree
 810          * for this case.  Only translations (RDR, NAT) can cause this.
 811          *
 812          * When this flag is not set we must still check the global statetbl
 813          * for a collision, and if we find one we set the HALF_DUPLEX flag
 814          * in the state.
 815          */
 816         if (s->state_flags & PFSTATE_STACK_GLOBAL) {
 817                 cpu = MAXCPU;
 818                 lockmgr(&pf_global_statetbl_lock, LK_EXCLUSIVE);
 819         } else {
 820                 cpu = mycpu->gd_cpuid;
 821                 lockmgr(&pf_global_statetbl_lock, LK_SHARED);
 822         }
 823         KKASSERT(s->key[idx] == NULL);  /* XXX handle this? */
 824
 825         if (pf_status.debug >= PF_DEBUG_MISC) {
 826                 kprintf("state_key attach cpu %d (%08x:%d) %s (%08x:%d)\n",
 827                         cpu,
 828                         ntohl(sk->addr[0].addr32[0]), ntohs(sk->port[0]),
 829                         (idx == PF_SK_WIRE ? "->" : "<-"),
 830                         ntohl(sk->addr[1].addr32[0]), ntohs(sk->port[1]));
 831         }
 832
 833         /*
 834          * Check whether (e.g.) a PASS rule being put on a per-cpu tree
 835          * collides with a translation rule on the global tree.  This is
 836          * NOT an error.  We *WANT* to establish state for this case so the
 837          * packet path is short-cutted and doesn't need to scan the ruleset
 838          * on every packet.  But the established state will only see one
 839          * side of a two-way packet conversation.  To prevent this from
 840          * causing problems (e.g. generating a RST), we force PFSTATE_SLOPPY
 841          * to be set on the established state.
 842          *
 843          * A collision against RDR state can only occur with a PASS IN in the
 844          * opposite direction or a PASS OUT in the forwards direction.  This
 845          * is because RDRs are processed on the input side.
 846          *
 847          * A collision against NAT state can only occur with a PASS IN in the
 848          * forwards direction or a PASS OUT in the opposite direction.  This
 849          * is because NATs are processed on the output side.
 850          *
 851          * In both situations we need to do a reverse addr/port test because
 852          * the PASS IN or PASS OUT only establishes if it doesn't match the
 853          * established RDR state in the forwards direction.  The direction
 854          * flag has to be ignored (it will be one way for a PASS IN and the
 855          * other way for a PASS OUT).
 856          *
 857          * pf_global_statetbl_lock will be locked shared when testing and
 858          * not entering into the global state table.
 859          */
 860         if (cpu != MAXCPU &&
 861             (cur = RB_FIND(pf_state_rtree,
 862                            (struct pf_state_rtree *)&pf_statetbl[MAXCPU],
 863                            sk)) != NULL) {
 864                 TAILQ_FOREACH(si, &cur->states, entry) {
 865                         /*
 866                          * NOTE: We must ignore direction mismatches.
 867                          */
 868                         if (si->s->kif == s->kif) {
 869                                 s->state_flags |= PFSTATE_HALF_DUPLEX |
 870                                                   PFSTATE_SLOPPY;
 871                                 if (pf_status.debug >= PF_DEBUG_MISC) {
 872                                         kprintf(
 873                                             "pf: %s key attach collision "
 874                                             "on %s: ",
 875                                             (idx == PF_SK_WIRE) ?
 876                                             "wire" : "stack",
 877                                             s->kif->pfik_name);
 878                                         pf_print_state_parts(s,
 879                                             (idx == PF_SK_WIRE) ? sk : NULL,
 880                                             (idx == PF_SK_STACK) ? sk : NULL);
 881                                         kprintf("\n");
 882                                 }
 883                                 break;
 884                         }
 885                 }
 886         }
 887
 888         /*
 889          * Enter into either the per-cpu or the global state table.
 890          *
 891          * pf_global_statetbl_lock will be locked exclusively when entering
 892          * into the global state table.
 893          */
 894         if ((cur = RB_INSERT(pf_state_tree, &pf_statetbl[cpu], sk)) != NULL) {
 895                 /* key exists. check for same kif, if none, add to key */
 896                 TAILQ_FOREACH(si, &cur->states, entry) {
 897                         if (si->s->kif == s->kif &&
 898                             si->s->direction == s->direction) {
 899                                 if (pf_status.debug >= PF_DEBUG_MISC) {
 900                                         kprintf(
 901                                             "pf: %s key attach failed on %s: ",
 902                                             (idx == PF_SK_WIRE) ?
 903                                             "wire" : "stack",
 904                                             s->kif->pfik_name);
 905                                         pf_print_state_parts(s,
 906                                             (idx == PF_SK_WIRE) ? sk : NULL,
 907                                             (idx == PF_SK_STACK) ? sk : NULL);
 908                                         kprintf("\n");
 909                                 }
 910                                 kfree(sk, M_PFSTATEKEYPL);
 911                                 error = -1;
 912                                 goto failed;    /* collision! */
 913                         }
 914                 }
 915                 kfree(sk, M_PFSTATEKEYPL);
 916
 917                 s->key[idx] = cur;
 918         } else {
 919                 s->key[idx] = sk;
 920         }
 921
 922         if ((si = kmalloc(sizeof(struct pf_state_item),
 923                           M_PFSTATEITEMPL, M_NOWAIT)) == NULL) {
 924                 pf_state_key_detach(s, idx);
 925                 error = -1;
 926                 goto failed;    /* collision! */
 927         }
 928         si->s = s;
 929
 930         /* list is sorted, if-bound states before floating */
 931         if (s->kif == pfi_all)
 932                 TAILQ_INSERT_TAIL(&s->key[idx]->states, si, entry);
 933         else
 934                 TAILQ_INSERT_HEAD(&s->key[idx]->states, si, entry);
 935
 936         error = 0;
 937 failed:
 938         lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
 939         return error;
 940 }
 941
 942 /*
 943  * NOTE: Can only be called indirectly via the purge thread with pf_token
 944  *       exclusively locked.
 945  */
 946 void
 947 pf_detach_state(struct pf_state *s)
 948 {
 949         if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK])
 950                 s->key[PF_SK_WIRE] = NULL;
 951
 952         if (s->key[PF_SK_STACK] != NULL)
 953                 pf_state_key_detach(s, PF_SK_STACK);
 954
 955         if (s->key[PF_SK_WIRE] != NULL)
 956                 pf_state_key_detach(s, PF_SK_WIRE);
 957 }
 958
 959 /*
 960  * NOTE: Can only be called indirectly via the purge thread with pf_token
 961  *       exclusively locked.
 962  */
 963 void
 964 pf_state_key_detach(struct pf_state *s, int idx)
 965 {
 966         struct pf_state_item    *si;
 967         int cpu;
 968
 969         /*
 970          * PFSTATE_STACK_GLOBAL is set for translations when the translated
 971          * address/port is not localized to the same cpu that the untranslated
 972          * address/port is on.  The wire pf_state_key is managed on the global
 973          * statetbl tree for this case.
 974          */
 975         if (s->state_flags & PFSTATE_STACK_GLOBAL) {
 976                 cpu = MAXCPU;
 977                 lockmgr(&pf_global_statetbl_lock, LK_EXCLUSIVE);
 978         } else {
 979                 cpu = mycpu->gd_cpuid;
 980         }
 981
 982         si = TAILQ_FIRST(&s->key[idx]->states);
 983         while (si && si->s != s)
 984                 si = TAILQ_NEXT(si, entry);
 985
 986         if (si) {
 987                 TAILQ_REMOVE(&s->key[idx]->states, si, entry);
 988                 kfree(si, M_PFSTATEITEMPL);
 989         }
 990
 991         if (TAILQ_EMPTY(&s->key[idx]->states)) {
 992                 RB_REMOVE(pf_state_tree, &pf_statetbl[cpu], s->key[idx]);
 993                 if (s->key[idx]->reverse)
 994                         s->key[idx]->reverse->reverse = NULL;
 995                 if (s->key[idx]->inp)
 996                         s->key[idx]->inp->inp_pf_sk = NULL;
 997                 kfree(s->key[idx], M_PFSTATEKEYPL);
 998         }
 999         s->key[idx] = NULL;
1000
1001         if (s->state_flags & PFSTATE_STACK_GLOBAL)
1002                 lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
1003 }
1004
1005 struct pf_state_key *
1006 pf_alloc_state_key(int pool_flags)
1007 {
1008         struct pf_state_key     *sk;
1009
1010         sk = kmalloc(sizeof(struct pf_state_key), M_PFSTATEKEYPL, pool_flags);
1011         if (sk) {
1012                 TAILQ_INIT(&sk->states);
1013         }
1014         return (sk);
1015 }
1016
1017 int
1018 pf_state_key_setup(struct pf_pdesc *pd, struct pf_rule *nr,
1019         struct pf_state_key **skw, struct pf_state_key **sks,
1020         struct pf_state_key **skp, struct pf_state_key **nkp,
1021         struct pf_addr *saddr, struct pf_addr *daddr,
1022         u_int16_t sport, u_int16_t dport)
1023 {
1024         KKASSERT((*skp == NULL && *nkp == NULL));
1025
1026         if ((*skp = pf_alloc_state_key(M_NOWAIT | M_ZERO)) == NULL)
1027                 return (ENOMEM);
1028
1029         PF_ACPY(&(*skp)->addr[pd->sidx], saddr, pd->af);
1030         PF_ACPY(&(*skp)->addr[pd->didx], daddr, pd->af);
1031         (*skp)->port[pd->sidx] = sport;
1032         (*skp)->port[pd->didx] = dport;
1033         (*skp)->proto = pd->proto;
1034         (*skp)->af = pd->af;
1035
1036         if (nr != NULL) {
1037                 if ((*nkp = pf_alloc_state_key(M_NOWAIT | M_ZERO)) == NULL)
1038                         return (ENOMEM); /* caller must handle cleanup */
1039
1040                 /* XXX maybe just bcopy and TAILQ_INIT(&(*nkp)->states) */
1041                 PF_ACPY(&(*nkp)->addr[0], &(*skp)->addr[0], pd->af);
1042                 PF_ACPY(&(*nkp)->addr[1], &(*skp)->addr[1], pd->af);
1043                 (*nkp)->port[0] = (*skp)->port[0];
1044                 (*nkp)->port[1] = (*skp)->port[1];
1045                 (*nkp)->proto = pd->proto;
1046                 (*nkp)->af = pd->af;
1047         } else {
1048                 *nkp = *skp;
1049         }
1050
1051         if (pd->dir == PF_IN) {
1052                 *skw = *skp;
1053                 *sks = *nkp;
1054         } else {
1055                 *sks = *skp;
1056                 *skw = *nkp;
1057         }
1058         return (0);
1059 }
1060
1061 /*
1062  * Insert pf_state with one or two state keys (allowing a reverse path lookup
1063  * which is used by NAT).  In the NAT case skw is the initiator (?) and
1064  * sks is the target.
1065  */
1066 int
1067 pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
1068                 struct pf_state_key *sks, struct pf_state *s)
1069 {
1070         int cpu = mycpu->gd_cpuid;
1071
1072         s->kif = kif;
1073         s->cpuid = cpu;
1074
1075         if (skw == sks) {
1076                 if (pf_state_key_attach(skw, s, PF_SK_WIRE))
1077                         return (-1);
1078                 s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
1079         } else {
1080                 /*
1081                 skw->reverse = sks;
1082                 sks->reverse = skw;
1083                 */
1084                 if (pf_state_key_attach(skw, s, PF_SK_WIRE)) {
1085                         kfree(sks, M_PFSTATEKEYPL);
1086                         return (-1);
1087                 }
1088                 if (pf_state_key_attach(sks, s, PF_SK_STACK)) {
1089                         pf_state_key_detach(s, PF_SK_WIRE);
1090                         return (-1);
1091                 }
1092         }
1093
1094         if (s->id == 0 && s->creatorid == 0) {
1095                 u_int64_t sid;
1096
1097 #if __SIZEOF_LONG__ == 8
1098                 sid = atomic_fetchadd_long(&pf_status.stateid, 1);
1099 #else
1100                 spin_lock(&pf_spin);
1101                 sid = pf_status.stateid++;
1102                 spin_unlock(&pf_spin);
1103 #endif
1104                 s->id = htobe64(sid);
1105                 s->creatorid = pf_status.hostid;
1106         }
1107
1108         /*
1109          * Calculate hash code for altq
1110          */
1111         s->hash = crc32(s->key[PF_SK_WIRE], PF_STATE_KEY_HASH_LENGTH);
1112
1113         if (RB_INSERT(pf_state_tree_id, &tree_id[cpu], s) != NULL) {
1114                 if (pf_status.debug >= PF_DEBUG_MISC) {
1115                         kprintf("pf: state insert failed: "
1116                             "id: %016jx creatorid: %08x",
1117                               (uintmax_t)be64toh(s->id), ntohl(s->creatorid));
1118                         if (s->sync_flags & PFSTATE_FROMSYNC)
1119                                 kprintf(" (from sync)");
1120                         kprintf("\n");
1121                 }
1122                 pf_detach_state(s);
1123                 return (-1);
1124         }
1125         TAILQ_INSERT_TAIL(&state_list[cpu], s, entry_list);
1126         pf_status.fcounters[FCNT_STATE_INSERT]++;
1127         atomic_add_int(&pf_status.states, 1);
1128         pfi_kif_ref(kif, PFI_KIF_REF_STATE);
1129         pfsync_insert_state(s);
1130         return (0);
1131 }
1132
1133 struct pf_state *
1134 pf_find_state_byid(struct pf_state_cmp *key)
1135 {
1136         int cpu = mycpu->gd_cpuid;
1137
1138         pf_status.fcounters[FCNT_STATE_SEARCH]++;
1139
1140         return (RB_FIND(pf_state_tree_id, &tree_id[cpu],
1141                         (struct pf_state *)key));
1142 }
1143
1144 /*
1145  * WARNING! May return a state structure that was localized to another cpu,
1146  *          destruction is typically protected by the callers pf_token.
1147  *          The element can only be destroyed
1148  */
1149 struct pf_state *
1150 pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir,
1151               struct mbuf *m)
1152 {
1153         struct pf_state_key     *skey = (void *)key;
1154         struct pf_state_key     *sk;
1155         struct pf_state_item    *si;
1156         struct pf_state *s;
1157         int cpu = mycpu->gd_cpuid;
1158         int globalstl = 0;
1159
1160         pf_status.fcounters[FCNT_STATE_SEARCH]++;
1161
1162         if (dir == PF_OUT && m->m_pkthdr.pf.statekey &&
1163             ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->reverse) {
1164                 sk = ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->reverse;
1165         } else {
1166                 sk = RB_FIND(pf_state_tree, &pf_statetbl[cpu], skey);
1167                 if (sk == NULL) {
1168                         lockmgr(&pf_global_statetbl_lock, LK_SHARED);
1169                         sk = RB_FIND(pf_state_tree, &pf_statetbl[MAXCPU], skey);
1170                         if (sk == NULL) {
1171                                 lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
1172                                 return (NULL);
1173                         }
1174                         globalstl = 1;
1175                 }
1176                 if (dir == PF_OUT && m->m_pkthdr.pf.statekey) {
1177                         ((struct pf_state_key *)
1178                             m->m_pkthdr.pf.statekey)->reverse = sk;
1179                         sk->reverse = m->m_pkthdr.pf.statekey;
1180                 }
1181         }
1182         if (dir == PF_OUT)
1183                 m->m_pkthdr.pf.statekey = NULL;
1184
1185         /* list is sorted, if-bound states before floating ones */
1186         TAILQ_FOREACH(si, &sk->states, entry) {
1187                 if ((si->s->kif == pfi_all || si->s->kif == kif) &&
1188                     sk == (dir == PF_IN ? si->s->key[PF_SK_WIRE] :
1189                                           si->s->key[PF_SK_STACK])) {
1190                         break;
1191                 }
1192         }
1193
1194         /*
1195          * Extract state before potentially releasing the global statetbl
1196          * lock.  Ignore the state if the create is still in-progress as
1197          * it can be deleted out from under us by the owning localized cpu.
1198          * However, if CREATEINPROG is not set, state can only be deleted
1199          * by the purge thread which we are protected from via our shared
1200          * pf_token.
1201          */
1202         if (si) {
1203                 s = si->s;
1204                 if (s && (s->state_flags & PFSTATE_CREATEINPROG))
1205                         s = NULL;
1206         } else {
1207                 s = NULL;
1208         }
1209         if (globalstl)
1210                 lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
1211         return s;
1212 }
1213
1214 /*
1215  * WARNING! May return a state structure that was localized to another cpu,
1216  *          destruction is typically protected by the callers pf_token.
1217  */
1218 struct pf_state *
1219 pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
1220 {
1221         struct pf_state_key     *skey = (void *)key;
1222         struct pf_state_key     *sk;
1223         struct pf_state_item    *si, *ret = NULL;
1224         struct pf_state         *s;
1225         int cpu = mycpu->gd_cpuid;
1226         int globalstl = 0;
1227
1228         pf_status.fcounters[FCNT_STATE_SEARCH]++;
1229
1230         sk = RB_FIND(pf_state_tree, &pf_statetbl[cpu], skey);
1231         if (sk == NULL) {
1232                 lockmgr(&pf_global_statetbl_lock, LK_SHARED);
1233                 sk = RB_FIND(pf_state_tree, &pf_statetbl[MAXCPU], skey);
1234                 globalstl = 1;
1235         }
1236         if (sk != NULL) {
1237                 TAILQ_FOREACH(si, &sk->states, entry)
1238                         if (dir == PF_INOUT ||
1239                             (sk == (dir == PF_IN ? si->s->key[PF_SK_WIRE] :
1240                             si->s->key[PF_SK_STACK]))) {
1241                                 if (more == NULL) {
1242                                         ret = si;
1243                                         break;
1244                                 }
1245                                 if (ret)
1246                                         (*more)++;
1247                                 else
1248                                         ret = si;
1249                         }
1250         }
1251
1252         /*
1253          * Extract state before potentially releasing the global statetbl
1254          * lock.  Ignore the state if the create is still in-progress as
1255          * it can be deleted out from under us by the owning localized cpu.
1256          * However, if CREATEINPROG is not set, state can only be deleted
1257          * by the purge thread which we are protected from via our shared
1258          * pf_token.
1259          */
1260         if (ret) {
1261                 s = ret->s;
1262                 if (s && (s->state_flags & PFSTATE_CREATEINPROG))
1263                         s = NULL;
1264         } else {
1265                 s = NULL;
1266         }
1267         if (globalstl)
1268                 lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
1269         return s;
1270 }
1271
1272 /* END state table stuff */
1273
1274 void
1275 pf_purge_thread(void *v)
1276 {
1277         globaldata_t save_gd = mycpu;
1278         int nloops = 0;
1279         int locked = 0;
1280         int nn;
1281         int endingit;
1282
1283         for (;;) {
1284                 tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
1285
1286                 endingit = pf_end_threads;
1287
1288                 for (nn = 0; nn < ncpus; ++nn) {
1289                         lwkt_setcpu_self(globaldata_find(nn));
1290
1291                         lwkt_gettoken(&pf_token);
1292                         lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
1293                         crit_enter();
1294
1295                         /*
1296                          * process a fraction of the state table every second
1297                          */
1298                         if(!pf_purge_expired_states(
1299                                 1 + (pf_status.states /
1300                                      pf_default_rule.timeout[
1301                                         PFTM_INTERVAL]), 0)) {
1302                                 pf_purge_expired_states(
1303                                         1 + (pf_status.states /
1304                                              pf_default_rule.timeout[
1305                                                 PFTM_INTERVAL]), 1);
1306                         }
1307
1308                         /*
1309                          * purge other expired types every PFTM_INTERVAL
1310                          * seconds
1311                          */
1312                         if (++nloops >=
1313                             pf_default_rule.timeout[PFTM_INTERVAL]) {
1314                                 pf_purge_expired_fragments();
1315                                 if (!pf_purge_expired_src_nodes(locked)) {
1316                                         pf_purge_expired_src_nodes(1);
1317                                 }
1318                                 nloops = 0;
1319                         }
1320
1321                         /*
1322                          * If terminating the thread, clean everything out
1323                          * (on all cpus).
1324                          */
1325                         if (endingit) {
1326                                 pf_purge_expired_states(pf_status.states, 0);
1327                                 pf_purge_expired_fragments();
1328                                 pf_purge_expired_src_nodes(1);
1329                         }
1330
1331                         crit_exit();
1332                         lockmgr(&pf_consistency_lock, LK_RELEASE);
1333                         lwkt_reltoken(&pf_token);
1334                 }
1335                 lwkt_setcpu_self(save_gd);
1336                 if (endingit)
1337                         break;
1338         }
1339
1340         /*
1341          * Thread termination
1342          */
1343         pf_end_threads++;
1344         wakeup(pf_purge_thread);
1345         kthread_exit();
1346 }
1347
1348 u_int32_t
1349 pf_state_expires(const struct pf_state *state)
1350 {
1351         u_int32_t       timeout;
1352         u_int32_t       start;
1353         u_int32_t       end;
1354         u_int32_t       states;
1355
1356         /* handle all PFTM_* > PFTM_MAX here */
1357         if (state->timeout == PFTM_PURGE)
1358                 return (time_second);
1359         if (state->timeout == PFTM_UNTIL_PACKET)
1360                 return (0);
1361         KKASSERT(state->timeout != PFTM_UNLINKED);
1362         KKASSERT(state->timeout < PFTM_MAX);
1363         timeout = state->rule.ptr->timeout[state->timeout];
1364         if (!timeout)
1365                 timeout = pf_default_rule.timeout[state->timeout];
1366         start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
1367         if (start) {
1368                 end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
1369                 states = state->rule.ptr->states_cur;
1370         } else {
1371                 start = pf_default_rule.timeout[PFTM_ADAPTIVE_START];
1372                 end = pf_default_rule.timeout[PFTM_ADAPTIVE_END];
1373                 states = pf_status.states;
1374         }
1375         if (end && states > start && start < end) {
1376                 if (states < end)
1377                         return (state->expire + timeout * (end - states) /
1378                             (end - start));
1379                 else
1380                         return (time_second);
1381         }
1382         return (state->expire + timeout);
1383 }
1384
1385 /*
1386  * (called with exclusive pf_token)
1387  */
1388 int
1389 pf_purge_expired_src_nodes(int waslocked)
1390 {
1391         struct pf_src_node *cur, *next;
1392         int locked = waslocked;
1393         int cpu = mycpu->gd_cpuid;
1394
1395         for (cur = RB_MIN(pf_src_tree, &tree_src_tracking[cpu]);
1396              cur;
1397              cur = next) {
1398                 next = RB_NEXT(pf_src_tree, &tree_src_tracking[cpu], cur);
1399
1400                 if (cur->states <= 0 && cur->expire <= time_second) {
1401                          if (!locked) {
1402                                  lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
1403                                  next = RB_NEXT(pf_src_tree,
1404                                      &tree_src_tracking[cpu], cur);
1405                                  locked = 1;
1406                          }
1407                          if (cur->rule.ptr != NULL) {
1408                                 /*
1409                                  * decrements in rule should be ok, token is
1410                                  * held exclusively in this code path.
1411                                  */
1412                                  cur->rule.ptr->src_nodes--;
1413                                  if (cur->rule.ptr->states_cur <= 0 &&
1414                                      cur->rule.ptr->max_src_nodes <= 0)
1415                                          pf_rm_rule(NULL, cur->rule.ptr);
1416                          }
1417                          RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], cur);
1418                          pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
1419                          atomic_add_int(&pf_status.src_nodes, -1);
1420                          kfree(cur, M_PFSRCTREEPL);
1421                 }
1422         }
1423         if (locked && !waslocked)
1424                 lockmgr(&pf_consistency_lock, LK_RELEASE);
1425         return(1);
1426 }
1427
1428 void
1429 pf_src_tree_remove_state(struct pf_state *s)
1430 {
1431         u_int32_t timeout;
1432
1433         if (s->src_node != NULL) {
1434                 if (s->src.tcp_est)
1435                         --s->src_node->conn;
1436                 if (--s->src_node->states <= 0) {
1437                         timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
1438                         if (!timeout) {
1439                                 timeout =
1440                                     pf_default_rule.timeout[PFTM_SRC_NODE];
1441                         }
1442                         s->src_node->expire = time_second + timeout;
1443                 }
1444         }
1445         if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
1446                 if (--s->nat_src_node->states <= 0) {
1447                         timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
1448                         if (!timeout)
1449                                 timeout =
1450                                     pf_default_rule.timeout[PFTM_SRC_NODE];
1451                         s->nat_src_node->expire = time_second + timeout;
1452                 }
1453         }
1454         s->src_node = s->nat_src_node = NULL;
1455 }
1456
1457 /* callers should be at crit_enter() */
1458 void
1459 pf_unlink_state(struct pf_state *cur)
1460 {
1461         int cpu = mycpu->gd_cpuid;
1462
1463         if (cur->src.state == PF_TCPS_PROXY_DST) {
1464                 /* XXX wire key the right one? */
1465                 pf_send_tcp(cur->rule.ptr, cur->key[PF_SK_WIRE]->af,
1466                     &cur->key[PF_SK_WIRE]->addr[1],
1467                     &cur->key[PF_SK_WIRE]->addr[0],
1468                     cur->key[PF_SK_WIRE]->port[1],
1469                     cur->key[PF_SK_WIRE]->port[0],
1470                     cur->src.seqhi, cur->src.seqlo + 1,
1471                     TH_RST|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL);
1472         }
1473         RB_REMOVE(pf_state_tree_id, &tree_id[cpu], cur);
1474         if (cur->creatorid == pf_status.hostid)
1475                 pfsync_delete_state(cur);
1476         cur->timeout = PFTM_UNLINKED;
1477         pf_src_tree_remove_state(cur);
1478         pf_detach_state(cur);
1479 }
1480
1481 static struct pf_state  *purge_cur[MAXCPU];
1482
1483 /*
1484  * callers should be at crit_enter() and hold pf_consistency_lock exclusively.
1485  * pf_token must also be held exclusively.
1486  */
1487 void
1488 pf_free_state(struct pf_state *cur)
1489 {
1490         int cpu = mycpu->gd_cpuid;
1491
1492         KKASSERT(cur->cpuid == cpu);
1493
1494         if (pfsyncif != NULL &&
1495             (pfsyncif->sc_bulk_send_next == cur ||
1496             pfsyncif->sc_bulk_terminator == cur))
1497                 return;
1498         KKASSERT(cur->timeout == PFTM_UNLINKED);
1499         /*
1500          * decrements in rule should be ok, token is
1501          * held exclusively in this code path.
1502          */
1503         if (--cur->rule.ptr->states_cur <= 0 &&
1504             cur->rule.ptr->src_nodes <= 0)
1505                 pf_rm_rule(NULL, cur->rule.ptr);
1506         if (cur->nat_rule.ptr != NULL) {
1507                 if (--cur->nat_rule.ptr->states_cur <= 0 &&
1508                         cur->nat_rule.ptr->src_nodes <= 0) {
1509                         pf_rm_rule(NULL, cur->nat_rule.ptr);
1510                 }
1511         }
1512         if (cur->anchor.ptr != NULL) {
1513                 if (--cur->anchor.ptr->states_cur <= 0)
1514                         pf_rm_rule(NULL, cur->anchor.ptr);
1515         }
1516         pf_normalize_tcp_cleanup(cur);
1517         pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
1518
1519         /*
1520          * We may be freeing pf_purge_expired_states()'s saved scan entry,
1521          * adjust it if necessary.
1522          */
1523         if (purge_cur[cpu] == cur) {
1524                 kprintf("PURGE CONFLICT\n");
1525                 purge_cur[cpu] = TAILQ_NEXT(purge_cur[cpu], entry_list);
1526         }
1527         TAILQ_REMOVE(&state_list[cpu], cur, entry_list);
1528         if (cur->tag)
1529                 pf_tag_unref(cur->tag);
1530         kfree(cur, M_PFSTATEPL);
1531         pf_status.fcounters[FCNT_STATE_REMOVALS]++;
1532         atomic_add_int(&pf_status.states, -1);
1533 }
1534
1535 int
1536 pf_purge_expired_states(u_int32_t maxcheck, int waslocked)
1537 {
1538         struct pf_state         *cur;
1539         int locked = waslocked;
1540         int cpu = mycpu->gd_cpuid;
1541
1542         while (maxcheck--) {
1543                 /*
1544                  * Wrap to start of list when we hit the end
1545                  */
1546                 cur = purge_cur[cpu];
1547                 if (cur == NULL) {
1548                         cur = TAILQ_FIRST(&state_list[cpu]);
1549                         if (cur == NULL)
1550                                 break;  /* list empty */
1551                 }
1552
1553                 /*
1554                  * Setup next (purge_cur) while we process this one.  If
1555                  * we block and something else deletes purge_cur,
1556                  * pf_free_state() will adjust it further ahead.
1557                  */
1558                 purge_cur[cpu] = TAILQ_NEXT(cur, entry_list);
1559
1560                 if (cur->timeout == PFTM_UNLINKED) {
1561                         /* free unlinked state */
1562                         if (! locked) {
1563                                 lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
1564                                 locked = 1;
1565                         }
1566                         pf_free_state(cur);
1567                 } else if (pf_state_expires(cur) <= time_second) {
1568                         /* unlink and free expired state */
1569                         pf_unlink_state(cur);
1570                         if (! locked) {
1571                                 if (!lockmgr(&pf_consistency_lock, LK_EXCLUSIVE))
1572                                         return (0);
1573                                 locked = 1;
1574                         }
1575                         pf_free_state(cur);
1576                 }
1577         }
1578
1579         if (locked)
1580                 lockmgr(&pf_consistency_lock, LK_RELEASE);
1581         return (1);
1582 }
1583
1584 int
1585 pf_tbladdr_setup(struct pf_ruleset *rs, struct pf_addr_wrap *aw)
1586 {
1587         if (aw->type != PF_ADDR_TABLE)
1588                 return (0);
1589         if ((aw->p.tbl = pfr_attach_table(rs, aw->v.tblname)) == NULL)
1590                 return (1);
1591         return (0);
1592 }
1593
1594 void
1595 pf_tbladdr_remove(struct pf_addr_wrap *aw)
1596 {
1597         if (aw->type != PF_ADDR_TABLE || aw->p.tbl == NULL)
1598                 return;
1599         pfr_detach_table(aw->p.tbl);
1600         aw->p.tbl = NULL;
1601 }
1602
1603 void
1604 pf_tbladdr_copyout(struct pf_addr_wrap *aw)
1605 {
1606         struct pfr_ktable *kt = aw->p.tbl;
1607
1608         if (aw->type != PF_ADDR_TABLE || kt == NULL)
1609                 return;
1610         if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
1611                 kt = kt->pfrkt_root;
1612         aw->p.tbl = NULL;
1613         aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
1614                 kt->pfrkt_cnt : -1;
1615 }
1616
1617 void
1618 pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
1619 {
1620         switch (af) {
1621 #ifdef INET
1622         case AF_INET: {
1623                 u_int32_t a = ntohl(addr->addr32[0]);
1624                 kprintf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
1625                     (a>>8)&255, a&255);
1626                 if (p) {
1627                         p = ntohs(p);
1628                         kprintf(":%u", p);
1629                 }
1630                 break;
1631         }
1632 #endif /* INET */
1633 #ifdef INET6
1634         case AF_INET6: {
1635                 u_int16_t b;
1636                 u_int8_t i, curstart, curend, maxstart, maxend;
1637                 curstart = curend = maxstart = maxend = 255;
1638                 for (i = 0; i < 8; i++) {
1639                         if (!addr->addr16[i]) {
1640                                 if (curstart == 255)
1641                                         curstart = i;
1642                                 curend = i;
1643                         } else {
1644                                 if ((curend - curstart) >
1645                                     (maxend - maxstart)) {
1646                                         maxstart = curstart;
1647                                         maxend = curend;
1648                                 }
1649                                 curstart = curend = 255;
1650                         }
1651                 }
1652                 if ((curend - curstart) >
1653                     (maxend - maxstart)) {
1654                         maxstart = curstart;
1655                         maxend = curend;
1656                 }
1657                 for (i = 0; i < 8; i++) {
1658                         if (i >= maxstart && i <= maxend) {
1659                                 if (i == 0)
1660                                         kprintf(":");
1661                                 if (i == maxend)
1662                                         kprintf(":");
1663                         } else {
1664                                 b = ntohs(addr->addr16[i]);
1665                                 kprintf("%x", b);
1666                                 if (i < 7)
1667                                         kprintf(":");
1668                         }
1669                 }
1670                 if (p) {
1671                         p = ntohs(p);
1672                         kprintf("[%u]", p);
1673                 }
1674                 break;
1675         }
1676 #endif /* INET6 */
1677         }
1678 }
1679
1680 void
1681 pf_print_state(struct pf_state *s)
1682 {
1683         pf_print_state_parts(s, NULL, NULL);
1684 }
1685
1686 void
1687 pf_print_state_parts(struct pf_state *s,
1688     struct pf_state_key *skwp, struct pf_state_key *sksp)
1689 {
1690         struct pf_state_key *skw, *sks;
1691         u_int8_t proto, dir;
1692
1693         /* Do our best to fill these, but they're skipped if NULL */
1694         skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
1695         sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
1696         proto = skw ? skw->proto : (sks ? sks->proto : 0);
1697         dir = s ? s->direction : 0;
1698
1699         switch (proto) {
1700         case IPPROTO_TCP:
1701                 kprintf("TCP ");
1702                 break;
1703         case IPPROTO_UDP:
1704                 kprintf("UDP ");
1705                 break;
1706         case IPPROTO_ICMP:
1707                 kprintf("ICMP ");
1708                 break;
1709         case IPPROTO_ICMPV6:
1710                 kprintf("ICMPV6 ");
1711                 break;
1712         default:
1713                 kprintf("%u ", skw->proto);
1714                 break;
1715         }
1716         switch (dir) {
1717         case PF_IN:
1718                 kprintf(" in");
1719                 break;
1720         case PF_OUT:
1721                 kprintf(" out");
1722                 break;
1723         }
1724         if (skw) {
1725                 kprintf(" wire: ");
1726                 pf_print_host(&skw->addr[0], skw->port[0], skw->af);
1727                 kprintf(" ");
1728                 pf_print_host(&skw->addr[1], skw->port[1], skw->af);
1729         }
1730         if (sks) {
1731                 kprintf(" stack: ");
1732                 if (sks != skw) {
1733                         pf_print_host(&sks->addr[0], sks->port[0], sks->af);
1734                         kprintf(" ");
1735                         pf_print_host(&sks->addr[1], sks->port[1], sks->af);
1736                 } else
1737                         kprintf("-");
1738         }
1739         if (s) {
1740                 if (proto == IPPROTO_TCP) {
1741                         kprintf(" [lo=%u high=%u win=%u modulator=%u",
1742                             s->src.seqlo, s->src.seqhi,
1743                             s->src.max_win, s->src.seqdiff);
1744                         if (s->src.wscale && s->dst.wscale)
1745                                 kprintf(" wscale=%u",
1746                                     s->src.wscale & PF_WSCALE_MASK);
1747                         kprintf("]");
1748                         kprintf(" [lo=%u high=%u win=%u modulator=%u",
1749                             s->dst.seqlo, s->dst.seqhi,
1750                             s->dst.max_win, s->dst.seqdiff);
1751                         if (s->src.wscale && s->dst.wscale)
1752                                 kprintf(" wscale=%u",
1753                                 s->dst.wscale & PF_WSCALE_MASK);
1754                         kprintf("]");
1755                 }
1756                 kprintf(" %u:%u", s->src.state, s->dst.state);
1757         }
1758 }
1759
1760 void
1761 pf_print_flags(u_int8_t f)
1762 {
1763         if (f)
1764                 kprintf(" ");
1765         if (f & TH_FIN)
1766                 kprintf("F");
1767         if (f & TH_SYN)
1768                 kprintf("S");
1769         if (f & TH_RST)
1770                 kprintf("R");
1771         if (f & TH_PUSH)
1772                 kprintf("P");
1773         if (f & TH_ACK)
1774                 kprintf("A");
1775         if (f & TH_URG)
1776                 kprintf("U");
1777         if (f & TH_ECE)
1778                 kprintf("E");
1779         if (f & TH_CWR)
1780                 kprintf("W");
1781 }
1782
1783 #define PF_SET_SKIP_STEPS(i)                                    \
1784         do {                                                    \
1785                 while (head[i] != cur) {                        \
1786                         head[i]->skip[i].ptr = cur;             \
1787                         head[i] = TAILQ_NEXT(head[i], entries); \
1788                 }                                               \
1789         } while (0)
1790
1791 void
1792 pf_calc_skip_steps(struct pf_rulequeue *rules)
1793 {
1794         struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
1795         int i;
1796
1797         cur = TAILQ_FIRST(rules);
1798         prev = cur;
1799         for (i = 0; i < PF_SKIP_COUNT; ++i)
1800                 head[i] = cur;
1801         while (cur != NULL) {
1802
1803                 if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
1804                         PF_SET_SKIP_STEPS(PF_SKIP_IFP);
1805                 if (cur->direction != prev->direction)
1806                         PF_SET_SKIP_STEPS(PF_SKIP_DIR);
1807                 if (cur->af != prev->af)
1808                         PF_SET_SKIP_STEPS(PF_SKIP_AF);
1809                 if (cur->proto != prev->proto)
1810                         PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
1811                 if (cur->src.neg != prev->src.neg ||
1812                     pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
1813                         PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
1814                 if (cur->src.port[0] != prev->src.port[0] ||
1815                     cur->src.port[1] != prev->src.port[1] ||
1816                     cur->src.port_op != prev->src.port_op)
1817                         PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
1818                 if (cur->dst.neg != prev->dst.neg ||
1819                     pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
1820                         PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
1821                 if (cur->dst.port[0] != prev->dst.port[0] ||
1822                     cur->dst.port[1] != prev->dst.port[1] ||
1823                     cur->dst.port_op != prev->dst.port_op)
1824                         PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
1825
1826                 prev = cur;
1827                 cur = TAILQ_NEXT(cur, entries);
1828         }
1829         for (i = 0; i < PF_SKIP_COUNT; ++i)
1830                 PF_SET_SKIP_STEPS(i);
1831 }
1832
1833 int
1834 pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
1835 {
1836         if (aw1->type != aw2->type)
1837                 return (1);
1838         switch (aw1->type) {
1839         case PF_ADDR_ADDRMASK:
1840         case PF_ADDR_RANGE:
1841                 if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
1842                         return (1);
1843                 if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
1844                         return (1);
1845                 return (0);
1846         case PF_ADDR_DYNIFTL:
1847                 return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
1848         case PF_ADDR_NOROUTE:
1849         case PF_ADDR_URPFFAILED:
1850                 return (0);
1851         case PF_ADDR_TABLE:
1852                 return (aw1->p.tbl != aw2->p.tbl);
1853         case PF_ADDR_RTLABEL:
1854                 return (aw1->v.rtlabel != aw2->v.rtlabel);
1855         default:
1856                 kprintf("invalid address type: %d\n", aw1->type);
1857                 return (1);
1858         }
1859 }
1860
1861 u_int16_t
1862 pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
1863 {
1864         u_int32_t       l;
1865
1866         if (udp && !cksum)
1867                 return (0x0000);
1868         l = cksum + old - new;
1869         l = (l >> 16) + (l & 65535);
1870         l = l & 65535;
1871         if (udp && !l)
1872                 return (0xFFFF);
1873         return (l);
1874 }
1875
1876 void
1877 pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
1878     struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
1879 {
1880         struct pf_addr  ao;
1881         u_int16_t       po = *p;
1882
1883         PF_ACPY(&ao, a, af);
1884         PF_ACPY(a, an, af);
1885
1886         *p = pn;
1887
1888         switch (af) {
1889 #ifdef INET
1890         case AF_INET:
1891                 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1892                     ao.addr16[0], an->addr16[0], 0),
1893                     ao.addr16[1], an->addr16[1], 0);
1894                 *p = pn;
1895                 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1896                     ao.addr16[0], an->addr16[0], u),
1897                     ao.addr16[1], an->addr16[1], u),
1898                     po, pn, u);
1899                 break;
1900 #endif /* INET */
1901 #ifdef INET6
1902         case AF_INET6:
1903                 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1904                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1905                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1906                     ao.addr16[0], an->addr16[0], u),
1907                     ao.addr16[1], an->addr16[1], u),
1908                     ao.addr16[2], an->addr16[2], u),
1909                     ao.addr16[3], an->addr16[3], u),
1910                     ao.addr16[4], an->addr16[4], u),
1911                     ao.addr16[5], an->addr16[5], u),
1912                     ao.addr16[6], an->addr16[6], u),
1913                     ao.addr16[7], an->addr16[7], u),
1914                     po, pn, u);
1915                 break;
1916 #endif /* INET6 */
1917         }
1918 }
1919
1920
1921 /* Changes a u_int32_t.  Uses a void * so there are no align restrictions */
1922 void
1923 pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
1924 {
1925         u_int32_t       ao;
1926
1927         memcpy(&ao, a, sizeof(ao));
1928         memcpy(a, &an, sizeof(u_int32_t));
1929         *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
1930             ao % 65536, an % 65536, u);
1931 }
1932
1933 #ifdef INET6
1934 void
1935 pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
1936 {
1937         struct pf_addr  ao;
1938
1939         PF_ACPY(&ao, a, AF_INET6);
1940         PF_ACPY(a, an, AF_INET6);
1941
1942         *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1943             pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1944             pf_cksum_fixup(pf_cksum_fixup(*c,
1945             ao.addr16[0], an->addr16[0], u),
1946             ao.addr16[1], an->addr16[1], u),
1947             ao.addr16[2], an->addr16[2], u),
1948             ao.addr16[3], an->addr16[3], u),
1949             ao.addr16[4], an->addr16[4], u),
1950             ao.addr16[5], an->addr16[5], u),
1951             ao.addr16[6], an->addr16[6], u),
1952             ao.addr16[7], an->addr16[7], u);
1953 }
1954 #endif /* INET6 */
1955
1956 void
1957 pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
1958     struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
1959     u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
1960 {
1961         struct pf_addr  oia, ooa;
1962
1963         PF_ACPY(&oia, ia, af);
1964         if (oa)
1965                 PF_ACPY(&ooa, oa, af);
1966
1967         /* Change inner protocol port, fix inner protocol checksum. */
1968         if (ip != NULL) {
1969                 u_int16_t       oip = *ip;
1970                 u_int32_t       opc = 0;
1971
1972                 if (pc != NULL)
1973                         opc = *pc;
1974                 *ip = np;
1975                 if (pc != NULL)
1976                         *pc = pf_cksum_fixup(*pc, oip, *ip, u);
1977                 *ic = pf_cksum_fixup(*ic, oip, *ip, 0);
1978                 if (pc != NULL)
1979                         *ic = pf_cksum_fixup(*ic, opc, *pc, 0);
1980         }
1981         /* Change inner ip address, fix inner ip and icmp checksums. */
1982         PF_ACPY(ia, na, af);
1983         switch (af) {
1984 #ifdef INET
1985         case AF_INET: {
1986                 u_int32_t        oh2c = *h2c;
1987
1988                 *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
1989                     oia.addr16[0], ia->addr16[0], 0),
1990                     oia.addr16[1], ia->addr16[1], 0);
1991                 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1992                     oia.addr16[0], ia->addr16[0], 0),
1993                     oia.addr16[1], ia->addr16[1], 0);
1994                 *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
1995                 break;
1996         }
1997 #endif /* INET */
1998 #ifdef INET6
1999         case AF_INET6:
2000                 *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2001                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2002                     pf_cksum_fixup(pf_cksum_fixup(*ic,
2003                     oia.addr16[0], ia->addr16[0], u),
2004                     oia.addr16[1], ia->addr16[1], u),
2005                     oia.addr16[2], ia->addr16[2], u),
2006                     oia.addr16[3], ia->addr16[3], u),
2007                     oia.addr16[4], ia->addr16[4], u),
2008                     oia.addr16[5], ia->addr16[5], u),
2009                     oia.addr16[6], ia->addr16[6], u),
2010                     oia.addr16[7], ia->addr16[7], u);
2011                 break;
2012 #endif /* INET6 */
2013         }
2014         /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
2015         if (oa) {
2016                 PF_ACPY(oa, na, af);
2017                 switch (af) {
2018 #ifdef INET
2019                 case AF_INET:
2020                         *hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2021                             ooa.addr16[0], oa->addr16[0], 0),
2022                             ooa.addr16[1], oa->addr16[1], 0);
2023                         break;
2024 #endif /* INET */
2025 #ifdef INET6
2026                 case AF_INET6:
2027                         *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2028                             pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2029                             pf_cksum_fixup(pf_cksum_fixup(*ic,
2030                             ooa.addr16[0], oa->addr16[0], u),
2031                             ooa.addr16[1], oa->addr16[1], u),
2032                             ooa.addr16[2], oa->addr16[2], u),
2033                             ooa.addr16[3], oa->addr16[3], u),
2034                             ooa.addr16[4], oa->addr16[4], u),
2035                             ooa.addr16[5], oa->addr16[5], u),
2036                             ooa.addr16[6], oa->addr16[6], u),
2037                             ooa.addr16[7], oa->addr16[7], u);
2038                         break;
2039 #endif /* INET6 */
2040                 }
2041         }
2042 }
2043
2044
2045 /*
2046  * Need to modulate the sequence numbers in the TCP SACK option
2047  * (credits to Krzysztof Pfaff for report and patch)
2048  */
2049 int
2050 pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2051     struct tcphdr *th, struct pf_state_peer *dst)
2052 {
2053         int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2054         u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2055         int copyback = 0, i, olen;
2056         struct raw_sackblock sack;
2057
2058 #define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
2059         if (hlen < TCPOLEN_SACKLEN ||
2060             !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2061                 return 0;
2062
2063         while (hlen >= TCPOLEN_SACKLEN) {
2064                 olen = opt[1];
2065                 switch (*opt) {
2066                 case TCPOPT_EOL:        /* FALLTHROUGH */
2067                 case TCPOPT_NOP:
2068                         opt++;
2069                         hlen--;
2070                         break;
2071                 case TCPOPT_SACK:
2072                         if (olen > hlen)
2073                                 olen = hlen;
2074                         if (olen >= TCPOLEN_SACKLEN) {
2075                                 for (i = 2; i + TCPOLEN_SACK <= olen;
2076                                     i += TCPOLEN_SACK) {
2077                                         memcpy(&sack, &opt[i], sizeof(sack));
2078                                         pf_change_a(&sack.rblk_start, &th->th_sum,
2079                                             htonl(ntohl(sack.rblk_start) -
2080                                             dst->seqdiff), 0);
2081                                         pf_change_a(&sack.rblk_end, &th->th_sum,
2082                                             htonl(ntohl(sack.rblk_end) -
2083                                             dst->seqdiff), 0);
2084                                         memcpy(&opt[i], &sack, sizeof(sack));
2085                                 }
2086                                 copyback = 1;
2087                         }
2088                         /* FALLTHROUGH */
2089                 default:
2090                         if (olen < 2)
2091                                 olen = 2;
2092                         hlen -= olen;
2093                         opt += olen;
2094                 }
2095         }
2096
2097         if (copyback)
2098                 m_copyback(m, off + sizeof(*th), thoptlen, opts);
2099         return (copyback);
2100 }
2101
2102 void
2103 pf_send_tcp(const struct pf_rule *r, sa_family_t af,
2104     const struct pf_addr *saddr, const struct pf_addr *daddr,
2105     u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2106     u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2107     u_int16_t rtag, struct ether_header *eh, struct ifnet *ifp)
2108 {
2109         struct mbuf     *m;
2110         int              len = 0, tlen;
2111 #ifdef INET
2112         struct ip       *h = NULL;
2113 #endif /* INET */
2114 #ifdef INET6
2115         struct ip6_hdr  *h6 = NULL;
2116 #endif /* INET6 */
2117         struct tcphdr   *th = NULL;
2118         char            *opt;
2119
2120         ASSERT_LWKT_TOKEN_HELD(&pf_token);
2121
2122         /* maximum segment size tcp option */
2123         tlen = sizeof(struct tcphdr);
2124         if (mss)
2125                 tlen += 4;
2126
2127         switch (af) {
2128 #ifdef INET
2129         case AF_INET:
2130                 len = sizeof(struct ip) + tlen;
2131                 break;
2132 #endif /* INET */
2133 #ifdef INET6
2134         case AF_INET6:
2135                 len = sizeof(struct ip6_hdr) + tlen;
2136                 break;
2137 #endif /* INET6 */
2138         }
2139
2140         /*
2141          * Create outgoing mbuf.
2142          *
2143          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
2144          * so make sure pf.flags is clear.
2145          */
2146         m = m_gethdr(M_NOWAIT, MT_HEADER);
2147         if (m == NULL) {
2148                 return;
2149         }
2150         if (tag)
2151                 m->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
2152         m->m_pkthdr.pf.flags = 0;
2153         m->m_pkthdr.pf.tag = rtag;
2154         /* XXX Recheck when upgrading to > 4.4 */
2155         m->m_pkthdr.pf.statekey = NULL;
2156         if (r != NULL && r->rtableid >= 0)
2157                 m->m_pkthdr.pf.rtableid = r->rtableid;
2158
2159 #ifdef ALTQ
2160         if (r != NULL && r->qid) {
2161                 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
2162                 m->m_pkthdr.pf.qid = r->qid;
2163                 m->m_pkthdr.pf.ecn_af = af;
2164                 m->m_pkthdr.pf.hdr = mtod(m, struct ip *);
2165         }
2166 #endif /* ALTQ */
2167         m->m_data += max_linkhdr;
2168         m->m_pkthdr.len = m->m_len = len;
2169         m->m_pkthdr.rcvif = NULL;
2170         bzero(m->m_data, len);
2171         switch (af) {
2172 #ifdef INET
2173         case AF_INET:
2174                 h = mtod(m, struct ip *);
2175
2176                 /* IP header fields included in the TCP checksum */
2177                 h->ip_p = IPPROTO_TCP;
2178                 h->ip_len = tlen;
2179                 h->ip_src.s_addr = saddr->v4.s_addr;
2180                 h->ip_dst.s_addr = daddr->v4.s_addr;
2181
2182                 th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2183                 break;
2184 #endif /* INET */
2185 #ifdef INET6
2186         case AF_INET6:
2187                 h6 = mtod(m, struct ip6_hdr *);
2188
2189                 /* IP header fields included in the TCP checksum */
2190                 h6->ip6_nxt = IPPROTO_TCP;
2191                 h6->ip6_plen = htons(tlen);
2192                 memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2193                 memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2194
2195                 th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2196                 break;
2197 #endif /* INET6 */
2198         }
2199
2200         /* TCP header */
2201         th->th_sport = sport;
2202         th->th_dport = dport;
2203         th->th_seq = htonl(seq);
2204         th->th_ack = htonl(ack);
2205         th->th_off = tlen >> 2;
2206         th->th_flags = flags;
2207         th->th_win = htons(win);
2208
2209         if (mss) {
2210                 opt = (char *)(th + 1);
2211                 opt[0] = TCPOPT_MAXSEG;
2212                 opt[1] = 4;
2213                 mss = htons(mss);
2214                 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2215         }
2216
2217         switch (af) {
2218 #ifdef INET
2219         case AF_INET:
2220                 /* TCP checksum */
2221                 th->th_sum = in_cksum(m, len);
2222
2223                 /* Finish the IP header */
2224                 h->ip_v = 4;
2225                 h->ip_hl = sizeof(*h) >> 2;
2226                 h->ip_tos = IPTOS_LOWDELAY;
2227                 h->ip_len = len;
2228                 h->ip_off = path_mtu_discovery ? IP_DF : 0;
2229                 h->ip_ttl = ttl ? ttl : ip_defttl;
2230                 h->ip_sum = 0;
2231                 if (eh == NULL) {
2232                         lwkt_reltoken(&pf_token);
2233                         ip_output(m, NULL, NULL, 0, NULL, NULL);
2234                         lwkt_gettoken(&pf_token);
2235                 } else {
2236                         struct route             ro;
2237                         struct rtentry           rt;
2238                         struct ether_header     *e = (void *)ro.ro_dst.sa_data;
2239
2240                         if (ifp == NULL) {
2241                                 m_freem(m);
2242                                 return;
2243                         }
2244                         rt.rt_ifp = ifp;
2245                         ro.ro_rt = &rt;
2246                         ro.ro_dst.sa_len = sizeof(ro.ro_dst);
2247                         ro.ro_dst.sa_family = pseudo_AF_HDRCMPLT;
2248                         bcopy(eh->ether_dhost, e->ether_shost, ETHER_ADDR_LEN);
2249                         bcopy(eh->ether_shost, e->ether_dhost, ETHER_ADDR_LEN);
2250                         e->ether_type = eh->ether_type;
2251                         /* XXX_IMPORT: later */
2252                         lwkt_reltoken(&pf_token);
2253                         ip_output(m, NULL, &ro, 0, NULL, NULL);
2254                         lwkt_gettoken(&pf_token);
2255                 }
2256                 break;
2257 #endif /* INET */
2258 #ifdef INET6
2259         case AF_INET6:
2260                 /* TCP checksum */
2261                 th->th_sum = in6_cksum(m, IPPROTO_TCP,
2262                     sizeof(struct ip6_hdr), tlen);
2263
2264                 h6->ip6_vfc |= IPV6_VERSION;
2265                 h6->ip6_hlim = IPV6_DEFHLIM;
2266
2267                 lwkt_reltoken(&pf_token);
2268                 ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
2269                 lwkt_gettoken(&pf_token);
2270                 break;
2271 #endif /* INET6 */
2272         }
2273 }
2274
2275 void
2276 pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2277     struct pf_rule *r)
2278 {
2279         struct mbuf     *m0;
2280
2281         /*
2282          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
2283          * so make sure pf.flags is clear.
2284          */
2285         if ((m0 = m_copy(m, 0, M_COPYALL)) == NULL)
2286                 return;
2287
2288         m0->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
2289         m0->m_pkthdr.pf.flags = 0;
2290         /* XXX Re-Check when Upgrading to > 4.4 */
2291         m0->m_pkthdr.pf.statekey = NULL;
2292
2293         if (r->rtableid >= 0)
2294                 m0->m_pkthdr.pf.rtableid = r->rtableid;
2295
2296 #ifdef ALTQ
2297         if (r->qid) {
2298                 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
2299                 m0->m_pkthdr.pf.qid = r->qid;
2300                 m0->m_pkthdr.pf.ecn_af = af;
2301                 m0->m_pkthdr.pf.hdr = mtod(m0, struct ip *);
2302         }
2303 #endif /* ALTQ */
2304
2305         switch (af) {
2306 #ifdef INET
2307         case AF_INET:
2308                 icmp_error(m0, type, code, 0, 0);
2309                 break;
2310 #endif /* INET */
2311 #ifdef INET6
2312         case AF_INET6:
2313                 icmp6_error(m0, type, code, 0);
2314                 break;
2315 #endif /* INET6 */
2316         }
2317 }
2318
2319 /*
2320  * Return 1 if the addresses a and b match (with mask m), otherwise return 0.
2321  * If n is 0, they match if they are equal. If n is != 0, they match if they
2322  * are different.
2323  */
2324 int
2325 pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2326     struct pf_addr *b, sa_family_t af)
2327 {
2328         int     match = 0;
2329
2330         switch (af) {
2331 #ifdef INET
2332         case AF_INET:
2333                 if ((a->addr32[0] & m->addr32[0]) ==
2334                     (b->addr32[0] & m->addr32[0]))
2335                         match++;
2336                 break;
2337 #endif /* INET */
2338 #ifdef INET6
2339         case AF_INET6:
2340                 if (((a->addr32[0] & m->addr32[0]) ==
2341                      (b->addr32[0] & m->addr32[0])) &&
2342                     ((a->addr32[1] & m->addr32[1]) ==
2343                      (b->addr32[1] & m->addr32[1])) &&
2344                     ((a->addr32[2] & m->addr32[2]) ==
2345                      (b->addr32[2] & m->addr32[2])) &&
2346                     ((a->addr32[3] & m->addr32[3]) ==
2347                      (b->addr32[3] & m->addr32[3])))
2348                         match++;
2349                 break;
2350 #endif /* INET6 */
2351         }
2352         if (match) {
2353                 if (n)
2354                         return (0);
2355                 else
2356                         return (1);
2357         } else {
2358                 if (n)
2359                         return (1);
2360                 else
2361                         return (0);
2362         }
2363 }
2364
2365 /*
2366  * Return 1 if b <= a <= e, otherwise return 0.
2367  */
2368 int
2369 pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2370     struct pf_addr *a, sa_family_t af)
2371 {
2372         switch (af) {
2373 #ifdef INET
2374         case AF_INET:
2375                 if ((a->addr32[0] < b->addr32[0]) ||
2376                     (a->addr32[0] > e->addr32[0]))
2377                         return (0);
2378                 break;
2379 #endif /* INET */
2380 #ifdef INET6
2381         case AF_INET6: {
2382                 int     i;
2383
2384                 /* check a >= b */
2385                 for (i = 0; i < 4; ++i)
2386                         if (a->addr32[i] > b->addr32[i])
2387                                 break;
2388                         else if (a->addr32[i] < b->addr32[i])
2389                                 return (0);
2390                 /* check a <= e */
2391                 for (i = 0; i < 4; ++i)
2392                         if (a->addr32[i] < e->addr32[i])
2393                                 break;
2394                         else if (a->addr32[i] > e->addr32[i])
2395                                 return (0);
2396                 break;
2397         }
2398 #endif /* INET6 */
2399         }
2400         return (1);
2401 }
2402
2403 int
2404 pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2405 {
2406         switch (op) {
2407         case PF_OP_IRG:
2408                 return ((p > a1) && (p < a2));
2409         case PF_OP_XRG:
2410                 return ((p < a1) || (p > a2));
2411         case PF_OP_RRG:
2412                 return ((p >= a1) && (p <= a2));
2413         case PF_OP_EQ:
2414                 return (p == a1);
2415         case PF_OP_NE:
2416                 return (p != a1);
2417         case PF_OP_LT:
2418                 return (p < a1);
2419         case PF_OP_LE:
2420                 return (p <= a1);
2421         case PF_OP_GT:
2422                 return (p > a1);
2423         case PF_OP_GE:
2424                 return (p >= a1);
2425         }
2426         return (0); /* never reached */
2427 }
2428
2429 int
2430 pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2431 {
2432         a1 = ntohs(a1);
2433         a2 = ntohs(a2);
2434         p = ntohs(p);
2435         return (pf_match(op, a1, a2, p));
2436 }
2437
2438 int
2439 pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2440 {
2441         if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2442                 return (0);
2443         return (pf_match(op, a1, a2, u));
2444 }
2445
2446 int
2447 pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2448 {
2449         if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2450                 return (0);
2451         return (pf_match(op, a1, a2, g));
2452 }
2453
2454 int
2455 pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag)
2456 {
2457         if (*tag == -1)
2458                 *tag = m->m_pkthdr.pf.tag;
2459
2460         return ((!r->match_tag_not && r->match_tag == *tag) ||
2461             (r->match_tag_not && r->match_tag != *tag));
2462 }
2463
2464 int
2465 pf_tag_packet(struct mbuf *m, int tag, int rtableid)
2466 {
2467         if (tag <= 0 && rtableid < 0)
2468                 return (0);
2469
2470         if (tag > 0)
2471                 m->m_pkthdr.pf.tag = tag;
2472         if (rtableid >= 0)
2473                 m->m_pkthdr.pf.rtableid = rtableid;
2474
2475         return (0);
2476 }
2477
2478 void
2479 pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n,
2480     struct pf_rule **r, struct pf_rule **a, int *match)
2481 {
2482         struct pf_anchor_stackframe     *f;
2483
2484         (*r)->anchor->match = 0;
2485         if (match)
2486                 *match = 0;
2487         if (*depth >= NELEM(pf_anchor_stack)) {
2488                 kprintf("pf_step_into_anchor: stack overflow\n");
2489                 *r = TAILQ_NEXT(*r, entries);
2490                 return;
2491         } else if (*depth == 0 && a != NULL)
2492                 *a = *r;
2493         f = pf_anchor_stack + (*depth)++;
2494         f->rs = *rs;
2495         f->r = *r;
2496         if ((*r)->anchor_wildcard) {
2497                 f->parent = &(*r)->anchor->children;
2498                 if ((f->child = RB_MIN(pf_anchor_node, f->parent)) ==
2499                     NULL) {
2500                         *r = NULL;
2501                         return;
2502                 }
2503                 *rs = &f->child->ruleset;
2504         } else {
2505                 f->parent = NULL;
2506                 f->child = NULL;
2507                 *rs = &(*r)->anchor->ruleset;
2508         }
2509         *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2510 }
2511
2512 int
2513 pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n,
2514     struct pf_rule **r, struct pf_rule **a, int *match)
2515 {
2516         struct pf_anchor_stackframe     *f;
2517         int quick = 0;
2518
2519         do {
2520                 if (*depth <= 0)
2521                         break;
2522                 f = pf_anchor_stack + *depth - 1;
2523                 if (f->parent != NULL && f->child != NULL) {
2524                         if (f->child->match ||
2525                             (match != NULL && *match)) {
2526                                 f->r->anchor->match = 1;
2527                                 *match = 0;
2528                         }
2529                         f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
2530                         if (f->child != NULL) {
2531                                 *rs = &f->child->ruleset;
2532                                 *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2533                                 if (*r == NULL)
2534                                         continue;
2535                                 else
2536                                         break;
2537                         }
2538                 }
2539                 (*depth)--;
2540                 if (*depth == 0 && a != NULL)
2541                         *a = NULL;
2542                 *rs = f->rs;
2543                 if (f->r->anchor->match || (match != NULL && *match))
2544                         quick = f->r->quick;
2545                 *r = TAILQ_NEXT(f->r, entries);
2546         } while (*r == NULL);
2547
2548         return (quick);
2549 }
2550
2551 #ifdef INET6
2552 void
2553 pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2554     struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2555 {
2556         switch (af) {
2557 #ifdef INET
2558         case AF_INET:
2559                 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2560                 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2561                 break;
2562 #endif /* INET */
2563         case AF_INET6:
2564                 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2565                 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2566                 naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2567                 ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
2568                 naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2569                 ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
2570                 naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2571                 ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
2572                 break;
2573         }
2574 }
2575
2576 void
2577 pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2578 {
2579         switch (af) {
2580 #ifdef INET
2581         case AF_INET:
2582                 addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2583                 break;
2584 #endif /* INET */
2585         case AF_INET6:
2586                 if (addr->addr32[3] == 0xffffffff) {
2587                         addr->addr32[3] = 0;
2588                         if (addr->addr32[2] == 0xffffffff) {
2589                                 addr->addr32[2] = 0;
2590                                 if (addr->addr32[1] == 0xffffffff) {
2591                                         addr->addr32[1] = 0;
2592                                         addr->addr32[0] =
2593                                             htonl(ntohl(addr->addr32[0]) + 1);
2594                                 } else
2595                                         addr->addr32[1] =
2596                                             htonl(ntohl(addr->addr32[1]) + 1);
2597                         } else
2598                                 addr->addr32[2] =
2599                                     htonl(ntohl(addr->addr32[2]) + 1);
2600                 } else
2601                         addr->addr32[3] =
2602                             htonl(ntohl(addr->addr32[3]) + 1);
2603                 break;
2604         }
2605 }
2606 #endif /* INET6 */
2607
2608 #define mix(a,b,c) \
2609         do {                                    \
2610                 a -= b; a -= c; a ^= (c >> 13); \
2611                 b -= c; b -= a; b ^= (a << 8);  \
2612                 c -= a; c -= b; c ^= (b >> 13); \
2613                 a -= b; a -= c; a ^= (c >> 12); \
2614                 b -= c; b -= a; b ^= (a << 16); \
2615                 c -= a; c -= b; c ^= (b >> 5);  \
2616                 a -= b; a -= c; a ^= (c >> 3);  \
2617                 b -= c; b -= a; b ^= (a << 10); \
2618                 c -= a; c -= b; c ^= (b >> 15); \
2619         } while (0)
2620
2621 /*
2622  * hash function based on bridge_hash in if_bridge.c
2623  */
2624 void
2625 pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
2626     struct pf_poolhashkey *key, sa_family_t af)
2627 {
2628         u_int32_t       a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
2629
2630         switch (af) {
2631 #ifdef INET
2632         case AF_INET:
2633                 a += inaddr->addr32[0];
2634                 b += key->key32[1];
2635                 mix(a, b, c);
2636                 hash->addr32[0] = c + key->key32[2];
2637                 break;
2638 #endif /* INET */
2639 #ifdef INET6
2640         case AF_INET6:
2641                 a += inaddr->addr32[0];
2642                 b += inaddr->addr32[2];
2643                 mix(a, b, c);
2644                 hash->addr32[0] = c;
2645                 a += inaddr->addr32[1];
2646                 b += inaddr->addr32[3];
2647                 c += key->key32[1];
2648                 mix(a, b, c);
2649                 hash->addr32[1] = c;
2650                 a += inaddr->addr32[2];
2651                 b += inaddr->addr32[1];
2652                 c += key->key32[2];
2653                 mix(a, b, c);
2654                 hash->addr32[2] = c;
2655                 a += inaddr->addr32[3];
2656                 b += inaddr->addr32[0];
2657                 c += key->key32[3];
2658                 mix(a, b, c);
2659                 hash->addr32[3] = c;
2660                 break;
2661 #endif /* INET6 */
2662         }
2663 }
2664
2665 int
2666 pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
2667     struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn)
2668 {
2669         unsigned char            hash[16];
2670         struct pf_pool          *rpool = &r->rpool;
2671         struct pf_pooladdr      *acur = rpool->cur;
2672         struct pf_pooladdr      *cur;
2673         struct pf_addr          *raddr;
2674         struct pf_addr          *rmask;
2675         struct pf_addr          counter;
2676         struct pf_src_node       k;
2677         int cpu = mycpu->gd_cpuid;
2678         int tblidx;
2679
2680         bzero(hash, sizeof(hash));      /* avoid gcc warnings */
2681
2682         /*
2683          * NOTE! rpool->cur and rpool->tblidx can be iterators and thus
2684          *       may represent a SMP race due to the shared nature of the
2685          *       rpool structure.  We allow the race and ensure that updates
2686          *       do not create a fatal condition.
2687          */
2688         cpu_ccfence();
2689         cur = acur;
2690         raddr = &cur->addr.v.a.addr;
2691         rmask = &cur->addr.v.a.mask;
2692
2693         if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
2694             (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
2695                 k.af = af;
2696                 PF_ACPY(&k.addr, saddr, af);
2697                 if (r->rule_flag & PFRULE_RULESRCTRACK ||
2698                     r->rpool.opts & PF_POOL_STICKYADDR)
2699                         k.rule.ptr = r;
2700                 else
2701                         k.rule.ptr = NULL;
2702                 pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
2703                 *sn = RB_FIND(pf_src_tree, &tree_src_tracking[cpu], &k);
2704                 if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
2705                         PF_ACPY(naddr, &(*sn)->raddr, af);
2706                         if (pf_status.debug >= PF_DEBUG_MISC) {
2707                                 kprintf("pf_map_addr: src tracking maps ");
2708                                 pf_print_host(&k.addr, 0, af);
2709                                 kprintf(" to ");
2710                                 pf_print_host(naddr, 0, af);
2711                                 kprintf("\n");
2712                         }
2713                         return (0);
2714                 }
2715         }
2716
2717         if (cur->addr.type == PF_ADDR_NOROUTE)
2718                 return (1);
2719         if (cur->addr.type == PF_ADDR_DYNIFTL) {
2720                 switch (af) {
2721 #ifdef INET
2722                 case AF_INET:
2723                         if (cur->addr.p.dyn->pfid_acnt4 < 1 &&
2724                             (rpool->opts & PF_POOL_TYPEMASK) !=
2725                             PF_POOL_ROUNDROBIN)
2726                                 return (1);
2727                         raddr = &cur->addr.p.dyn->pfid_addr4;
2728                         rmask = &cur->addr.p.dyn->pfid_mask4;
2729                         break;
2730 #endif /* INET */
2731 #ifdef INET6
2732                 case AF_INET6:
2733                         if (cur->addr.p.dyn->pfid_acnt6 < 1 &&
2734                             (rpool->opts & PF_POOL_TYPEMASK) !=
2735                             PF_POOL_ROUNDROBIN)
2736                                 return (1);
2737                         raddr = &cur->addr.p.dyn->pfid_addr6;
2738                         rmask = &cur->addr.p.dyn->pfid_mask6;
2739                         break;
2740 #endif /* INET6 */
2741                 }
2742         } else if (cur->addr.type == PF_ADDR_TABLE) {
2743                 if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
2744                         return (1); /* unsupported */
2745         } else {
2746                 raddr = &cur->addr.v.a.addr;
2747                 rmask = &cur->addr.v.a.mask;
2748         }
2749
2750         switch (rpool->opts & PF_POOL_TYPEMASK) {
2751         case PF_POOL_NONE:
2752                 PF_ACPY(naddr, raddr, af);
2753                 break;
2754         case PF_POOL_BITMASK:
2755                 PF_POOLMASK(naddr, raddr, rmask, saddr, af);
2756                 break;
2757         case PF_POOL_RANDOM:
2758                 if (init_addr != NULL && PF_AZERO(init_addr, af)) {
2759                         switch (af) {
2760 #ifdef INET
2761                         case AF_INET:
2762                                 counter.addr32[0] = htonl(karc4random());
2763                                 break;
2764 #endif /* INET */
2765 #ifdef INET6
2766                         case AF_INET6:
2767                                 if (rmask->addr32[3] != 0xffffffff)
2768                                         counter.addr32[3] =
2769                                                 htonl(karc4random());
2770                                 else
2771                                         break;
2772                                 if (rmask->addr32[2] != 0xffffffff)
2773                                         counter.addr32[2] =
2774                                                 htonl(karc4random());
2775                                 else
2776                                         break;
2777                                 if (rmask->addr32[1] != 0xffffffff)
2778                                         counter.addr32[1] =
2779                                                 htonl(karc4random());
2780                                 else
2781                                         break;
2782                                 if (rmask->addr32[0] != 0xffffffff)
2783                                         counter.addr32[0] =
2784                                                 htonl(karc4random());
2785                                 break;
2786 #endif /* INET6 */
2787                         }
2788                         PF_POOLMASK(naddr, raddr, rmask, &counter, af);
2789                         PF_ACPY(init_addr, naddr, af);
2790
2791                 } else {
2792                         counter = rpool->counter;
2793                         cpu_ccfence();
2794                         PF_AINC(&counter, af);
2795                         PF_POOLMASK(naddr, raddr, rmask, &counter, af);
2796                         rpool->counter = counter;
2797                 }
2798                 break;
2799         case PF_POOL_SRCHASH:
2800                 pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
2801                 PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
2802                 break;
2803         case PF_POOL_ROUNDROBIN:
2804                 tblidx = rpool->tblidx;
2805                 counter = rpool->counter;
2806                 if (cur->addr.type == PF_ADDR_TABLE) {
2807                         if (!pfr_pool_get(cur->addr.p.tbl,
2808                             &tblidx, &counter,
2809                             &raddr, &rmask, af)) {
2810                                 goto get_addr;
2811                         }
2812                 } else if (cur->addr.type == PF_ADDR_DYNIFTL) {
2813                         if (!pfr_pool_get(cur->addr.p.dyn->pfid_kt,
2814                             &tblidx, &counter,
2815                             &raddr, &rmask, af)) {
2816                                 goto get_addr;
2817                         }
2818                 } else if (pf_match_addr(0, raddr, rmask,
2819                                          &counter, af)) {
2820                         goto get_addr;
2821                 }
2822
2823         try_next:
2824                 if ((cur = TAILQ_NEXT(cur, entries)) == NULL)
2825                         cur = TAILQ_FIRST(&rpool->list);
2826                 if (cur->addr.type == PF_ADDR_TABLE) {
2827                         tblidx = -1;
2828                         if (pfr_pool_get(cur->addr.p.tbl,
2829                             &tblidx, &counter,
2830                             &raddr, &rmask, af)) {
2831                                 /* table contains no address of type 'af' */
2832                                 if (cur != acur)
2833                                         goto try_next;
2834                                 return (1);
2835                         }
2836                 } else if (cur->addr.type == PF_ADDR_DYNIFTL) {
2837                         tblidx = -1;
2838                         if (pfr_pool_get(cur->addr.p.dyn->pfid_kt,
2839                             &tblidx, &counter,
2840                             &raddr, &rmask, af)) {
2841                                 /* table contains no address of type 'af' */
2842                                 if (cur != acur)
2843                                         goto try_next;
2844                                 return (1);
2845                         }
2846                 } else {
2847                         raddr = &cur->addr.v.a.addr;
2848                         rmask = &cur->addr.v.a.mask;
2849                         PF_ACPY(&counter, raddr, af);
2850                 }
2851
2852         get_addr:
2853                 rpool->cur = cur;
2854                 rpool->tblidx = tblidx;
2855                 PF_ACPY(naddr, &counter, af);
2856                 if (init_addr != NULL && PF_AZERO(init_addr, af))
2857                         PF_ACPY(init_addr, naddr, af);
2858                 PF_AINC(&counter, af);
2859                 rpool->counter = counter;
2860                 break;
2861         }
2862         if (*sn != NULL)
2863                 PF_ACPY(&(*sn)->raddr, naddr, af);
2864
2865         if (pf_status.debug >= PF_DEBUG_MISC &&
2866             (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
2867                 kprintf("pf_map_addr: selected address ");
2868                 pf_print_host(naddr, 0, af);
2869                 kprintf("\n");
2870         }
2871
2872         return (0);
2873 }
2874
2875 int
2876 pf_get_sport(struct pf_pdesc *pd, sa_family_t af,
2877              u_int8_t proto, struct pf_rule *r,
2878              struct pf_addr *saddr, struct pf_addr *daddr,
2879              u_int16_t sport, u_int16_t dport,
2880              struct pf_addr *naddr, u_int16_t *nport,
2881              u_int16_t low, u_int16_t high, struct pf_src_node **sn)
2882 {
2883         struct pf_state_key_cmp key;
2884         struct pf_addr          init_addr;
2885         u_int16_t               cut;
2886         u_int32_t               hash_base = 0;
2887         int                     do_hash = 0;
2888
2889         bzero(&init_addr, sizeof(init_addr));
2890         if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
2891                 return (1);
2892
2893         if (proto == IPPROTO_ICMP) {
2894                 low = 1;
2895                 high = 65535;
2896         }
2897
2898         bzero(&key, sizeof(key));
2899         key.af = af;
2900         key.proto = proto;
2901         key.port[0] = dport;
2902         PF_ACPY(&key.addr[0], daddr, key.af);
2903
2904         do {
2905                 PF_ACPY(&key.addr[1], naddr, key.af);
2906
2907                 /*
2908                  * We want to select a port that calculates to a toeplitz hash
2909                  * that masks to the same cpu, otherwise the response may
2910                  * not see the new state.
2911                  *
2912                  * We can still do this even if the kernel is disregarding
2913                  * the hash and vectoring the packets to a specific cpu,
2914                  * but it will reduce the number of ports we can use.
2915                  */
2916                 switch(af) {
2917                 case AF_INET:
2918                         if (proto == IPPROTO_TCP) {
2919                                 do_hash = 1;
2920                                 hash_base = toeplitz_piecemeal_port(dport) ^
2921                                     toeplitz_piecemeal_addr(daddr->v4.s_addr) ^
2922                                     toeplitz_piecemeal_addr(naddr->v4.s_addr);
2923                         }
2924                         break;
2925                 case AF_INET6:
2926                         /* XXX TODO XXX */
2927                 default:
2928                         /* XXX TODO XXX */
2929                         break;
2930                 }
2931
2932                 /*
2933                  * port search; start random, step;
2934                  * similar 2 portloop in in_pcbbind
2935                  *
2936                  * WARNING! We try to match such that the kernel will
2937                  *          dispatch the translated host/port to the same
2938                  *          cpu, but this might not be possible.
2939                  *
2940                  *          In the case where the port is fixed, or for the
2941                  *          UDP case (whos toeplitz does not incorporate the
2942                  *          port), we set not_cpu_localized which ultimately
2943                  *          causes the pf_state_tree element
2944                  *
2945                  * XXX fixed ports present a problem for cpu localization.
2946                  */
2947                 if (!(proto == IPPROTO_TCP ||
2948                       proto == IPPROTO_UDP ||
2949                       proto == IPPROTO_ICMP)) {
2950                         /*
2951                          * non-specific protocol, leave port intact.
2952                          */
2953                         key.port[1] = sport;
2954                         if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2955                                 *nport = sport;
2956                                 pd->not_cpu_localized = 1;
2957                                 return (0);
2958                         }
2959                 } else if (low == 0 && high == 0) {
2960                         /*
2961                          * static-port same as originator.
2962                          */
2963                         key.port[1] = sport;
2964                         if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2965                                 *nport = sport;
2966                                 pd->not_cpu_localized = 1;
2967                                 return (0);
2968                         }
2969                 } else if (low == high) {
2970                         /*
2971                          * specific port as specified.
2972                          */
2973                         key.port[1] = htons(low);
2974                         if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2975                                 *nport = htons(low);
2976                                 pd->not_cpu_localized = 1;
2977                                 return (0);
2978                         }
2979                 } else {
2980                         /*
2981                          * normal dynamic port
2982                          */
2983                         u_int16_t tmp;
2984
2985                         if (low > high) {
2986                                 tmp = low;
2987                                 low = high;
2988                                 high = tmp;
2989                         }
2990                         /* low < high */
2991                         cut = htonl(karc4random()) % (1 + high - low) + low;
2992                         /* low <= cut <= high */
2993                         for (tmp = cut; tmp <= high; ++(tmp)) {
2994                                 key.port[1] = htons(tmp);
2995                                 if (do_hash) {
2996                                         uint32_t hash;
2997
2998                                         hash = hash_base ^
2999                                         toeplitz_piecemeal_port(key.port[1]);
3000                                         if (netisr_hashcpu(hash) != mycpuid)
3001                                                 continue;
3002                                 }
3003                                 if (pf_find_state_all(&key, PF_IN, NULL) ==
3004                                     NULL && !in_baddynamic(tmp, proto)) {
3005                                         if (proto == IPPROTO_UDP)
3006                                                 pd->not_cpu_localized = 1;
3007                                         *nport = htons(tmp);
3008                                         return (0);
3009                                 }
3010                         }
3011                         for (tmp = cut - 1; tmp >= low; --(tmp)) {
3012                                 key.port[1] = htons(tmp);
3013                                 if (do_hash) {
3014                                         uint32_t hash;
3015
3016                                         hash = hash_base ^
3017                                         toeplitz_piecemeal_port(key.port[1]);
3018                                         if (netisr_hashcpu(hash) != mycpuid)
3019                                                 continue;
3020                                 }
3021                                 if (pf_find_state_all(&key, PF_IN, NULL) ==
3022                                     NULL && !in_baddynamic(tmp, proto)) {
3023                                         if (proto == IPPROTO_UDP)
3024                                                 pd->not_cpu_localized = 1;
3025                                         *nport = htons(tmp);
3026                                         return (0);
3027                                 }
3028                         }
3029                 }
3030
3031                 /*
3032                  * Next address
3033                  */
3034                 switch (r->rpool.opts & PF_POOL_TYPEMASK) {
3035                 case PF_POOL_RANDOM:
3036                 case PF_POOL_ROUNDROBIN:
3037                         if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
3038                                 return (1);
3039                         break;
3040                 case PF_POOL_NONE:
3041                 case PF_POOL_SRCHASH:
3042                 case PF_POOL_BITMASK:
3043                 default:
3044                         return (1);
3045                 }
3046         } while (! PF_AEQ(&init_addr, naddr, af) );
3047         return (1);                                     /* none available */
3048 }
3049
3050 struct pf_rule *
3051 pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
3052     int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
3053     struct pf_addr *daddr, u_int16_t dport, int rs_num)
3054 {
3055         struct pf_rule          *r, *rm = NULL;
3056         struct pf_ruleset       *ruleset = NULL;
3057         int                      tag = -1;
3058         int                      rtableid = -1;
3059         int                      asd = 0;
3060
3061         r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
3062         while (r && rm == NULL) {
3063                 struct pf_rule_addr     *src = NULL, *dst = NULL;
3064                 struct pf_addr_wrap     *xdst = NULL;
3065                 struct pf_pooladdr      *cur;
3066
3067                 if (r->action == PF_BINAT && direction == PF_IN) {
3068                         src = &r->dst;
3069                         cur = r->rpool.cur;     /* SMP race possible */
3070                         cpu_ccfence();
3071                         if (cur)
3072                                 xdst = &cur->addr;
3073                 } else {
3074                         src = &r->src;
3075                         dst = &r->dst;
3076                 }
3077
3078                 r->evaluations++;
3079                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3080                         r = r->skip[PF_SKIP_IFP].ptr;
3081                 else if (r->direction && r->direction != direction)
3082                         r = r->skip[PF_SKIP_DIR].ptr;
3083                 else if (r->af && r->af != pd->af)
3084                         r = r->skip[PF_SKIP_AF].ptr;
3085                 else if (r->proto && r->proto != pd->proto)
3086                         r = r->skip[PF_SKIP_PROTO].ptr;
3087                 else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
3088                     src->neg, kif))
3089                         r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
3090                             PF_SKIP_DST_ADDR].ptr;
3091                 else if (src->port_op && !pf_match_port(src->port_op,
3092                     src->port[0], src->port[1], sport))
3093                         r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
3094                             PF_SKIP_DST_PORT].ptr;
3095                 else if (dst != NULL &&
3096                     PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL))
3097                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
3098                 else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
3099                     0, NULL))
3100                         r = TAILQ_NEXT(r, entries);
3101                 else if (dst != NULL && dst->port_op &&
3102                     !pf_match_port(dst->port_op, dst->port[0],
3103                     dst->port[1], dport))
3104                         r = r->skip[PF_SKIP_DST_PORT].ptr;
3105                 else if (r->match_tag && !pf_match_tag(m, r, &tag))
3106                         r = TAILQ_NEXT(r, entries);
3107                 else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
3108                     IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
3109                     off, pd->hdr.tcp), r->os_fingerprint)))
3110                         r = TAILQ_NEXT(r, entries);
3111                 else {
3112                         if (r->tag)
3113                                 tag = r->tag;
3114                         if (r->rtableid >= 0)
3115                                 rtableid = r->rtableid;
3116                         if (r->anchor == NULL) {
3117                                 rm = r;
3118                         } else
3119                                 pf_step_into_anchor(&asd, &ruleset, rs_num,
3120                                     &r, NULL, NULL);
3121                 }
3122                 if (r == NULL)
3123                         pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r,
3124                             NULL, NULL);
3125         }
3126         if (pf_tag_packet(m, tag, rtableid))
3127                 return (NULL);
3128         if (rm != NULL && (rm->action == PF_NONAT ||
3129             rm->action == PF_NORDR || rm->action == PF_NOBINAT))
3130                 return (NULL);
3131         return (rm);
3132 }
3133
3134 struct pf_rule *
3135 pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
3136     struct pfi_kif *kif, struct pf_src_node **sn,
3137     struct pf_state_key **skw, struct pf_state_key **sks,
3138     struct pf_state_key **skp, struct pf_state_key **nkp,
3139     struct pf_addr *saddr, struct pf_addr *daddr,
3140     u_int16_t sport, u_int16_t dport)
3141 {
3142         struct pf_rule  *r = NULL;
3143
3144         if (direction == PF_OUT) {
3145                 r = pf_match_translation(pd, m, off, direction, kif, saddr,
3146                     sport, daddr, dport, PF_RULESET_BINAT);
3147                 if (r == NULL)
3148                         r = pf_match_translation(pd, m, off, direction, kif,
3149                             saddr, sport, daddr, dport, PF_RULESET_NAT);
3150         } else {
3151                 r = pf_match_translation(pd, m, off, direction, kif, saddr,
3152                     sport, daddr, dport, PF_RULESET_RDR);
3153                 if (r == NULL)
3154                         r = pf_match_translation(pd, m, off, direction, kif,
3155                             saddr, sport, daddr, dport, PF_RULESET_BINAT);
3156         }
3157
3158         if (r != NULL) {
3159                 struct pf_addr  *naddr;
3160                 u_int16_t       *nport;
3161
3162                 if (pf_state_key_setup(pd, r, skw, sks, skp, nkp,
3163                     saddr, daddr, sport, dport))
3164                         return r;
3165
3166                 /* XXX We only modify one side for now. */
3167                 naddr = &(*nkp)->addr[1];
3168                 nport = &(*nkp)->port[1];
3169
3170                 /*
3171                  * NOTE: Currently all translations will clear
3172                  *       BRIDGE_MBUF_TAGGED, telling the bridge to
3173                  *       ignore the original input encapsulation.
3174                  */
3175                 switch (r->action) {
3176                 case PF_NONAT:
3177                 case PF_NOBINAT:
3178                 case PF_NORDR:
3179                         return (NULL);
3180                 case PF_NAT:
3181                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3182                         if (pf_get_sport(pd, pd->af, pd->proto, r,
3183                             saddr, daddr, sport, dport,
3184                             naddr, nport, r->rpool.proxy_port[0],
3185                             r->rpool.proxy_port[1], sn)) {
3186                                 DPFPRINTF(PF_DEBUG_MISC,
3187                                     ("pf: NAT proxy port allocation "
3188                                     "(%u-%u) failed\n",
3189                                     r->rpool.proxy_port[0],
3190                                     r->rpool.proxy_port[1]));
3191                                 return (NULL);
3192                         }
3193                         break;
3194                 case PF_BINAT:
3195                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3196                         switch (direction) {
3197                         case PF_OUT:
3198                                 if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
3199                                         switch (pd->af) {
3200 #ifdef INET
3201                                         case AF_INET:
3202                                                 if (r->rpool.cur->addr.p.dyn->
3203                                                     pfid_acnt4 < 1)
3204                                                         return (NULL);
3205                                                 PF_POOLMASK(naddr,
3206                                                     &r->rpool.cur->addr.p.dyn->
3207                                                     pfid_addr4,
3208                                                     &r->rpool.cur->addr.p.dyn->
3209                                                     pfid_mask4,
3210                                                     saddr, AF_INET);
3211                                                 break;
3212 #endif /* INET */
3213 #ifdef INET6
3214                                         case AF_INET6:
3215                                                 if (r->rpool.cur->addr.p.dyn->
3216                                                     pfid_acnt6 < 1)
3217                                                         return (NULL);
3218                                                 PF_POOLMASK(naddr,
3219                                                     &r->rpool.cur->addr.p.dyn->
3220                                                     pfid_addr6,
3221                                                     &r->rpool.cur->addr.p.dyn->
3222                                                     pfid_mask6,
3223                                                     saddr, AF_INET6);
3224                                                 break;
3225 #endif /* INET6 */
3226                                         }
3227                                 } else
3228                                         PF_POOLMASK(naddr,
3229                                             &r->rpool.cur->addr.v.a.addr,
3230                                             &r->rpool.cur->addr.v.a.mask,
3231                                             saddr, pd->af);
3232                                 break;
3233                         case PF_IN:
3234                                 if (r->src.addr.type == PF_ADDR_DYNIFTL) {
3235                                         switch (pd->af) {
3236 #ifdef INET
3237                                         case AF_INET:
3238                                                 if (r->src.addr.p.dyn->
3239                                                     pfid_acnt4 < 1)
3240                                                         return (NULL);
3241                                                 PF_POOLMASK(naddr,
3242                                                     &r->src.addr.p.dyn->
3243                                                     pfid_addr4,
3244                                                     &r->src.addr.p.dyn->
3245                                                     pfid_mask4,
3246                                                     daddr, AF_INET);
3247                                                 break;
3248 #endif /* INET */
3249 #ifdef INET6
3250                                         case AF_INET6:
3251                                                 if (r->src.addr.p.dyn->
3252                                                     pfid_acnt6 < 1)
3253                                                         return (NULL);
3254                                                 PF_POOLMASK(naddr,
3255                                                     &r->src.addr.p.dyn->
3256                                                     pfid_addr6,
3257                                                     &r->src.addr.p.dyn->
3258                                                     pfid_mask6,
3259                                                     daddr, AF_INET6);
3260                                                 break;
3261 #endif /* INET6 */
3262                                         }
3263                                 } else
3264                                         PF_POOLMASK(naddr,
3265                                             &r->src.addr.v.a.addr,
3266                                             &r->src.addr.v.a.mask, daddr,
3267                                             pd->af);
3268                                 break;
3269                         }
3270                         break;
3271                 case PF_RDR: {
3272                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3273                         if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
3274                                 return (NULL);
3275                         if ((r->rpool.opts & PF_POOL_TYPEMASK) ==
3276                             PF_POOL_BITMASK)
3277                                 PF_POOLMASK(naddr, naddr,
3278                                     &r->rpool.cur->addr.v.a.mask, daddr,
3279                                     pd->af);
3280
3281                         if (r->rpool.proxy_port[1]) {
3282                                 u_int32_t       tmp_nport;
3283
3284                                 tmp_nport = ((ntohs(dport) -
3285                                     ntohs(r->dst.port[0])) %
3286                                     (r->rpool.proxy_port[1] -
3287                                     r->rpool.proxy_port[0] + 1)) +
3288                                     r->rpool.proxy_port[0];
3289
3290                                 /* wrap around if necessary */
3291                                 if (tmp_nport > 65535)
3292                                         tmp_nport -= 65535;
3293                                 *nport = htons((u_int16_t)tmp_nport);
3294                         } else if (r->rpool.proxy_port[0]) {
3295                                 *nport = htons(r->rpool.proxy_port[0]);
3296                         }
3297                         pd->not_cpu_localized = 1;
3298                         break;
3299                 }
3300                 default:
3301                         return (NULL);
3302                 }
3303         }
3304
3305         return (r);
3306 }
3307
3308 struct netmsg_hashlookup {
3309         struct netmsg_base      base;
3310         struct inpcb            **nm_pinp;
3311         struct inpcbinfo        *nm_pcbinfo;
3312         struct pf_addr          *nm_saddr;
3313         struct pf_addr          *nm_daddr;
3314         uint16_t                nm_sport;
3315         uint16_t                nm_dport;
3316         sa_family_t             nm_af;
3317 };
3318
3319 #ifdef PF_SOCKET_LOOKUP_DOMSG
3320 static void
3321 in_pcblookup_hash_handler(netmsg_t msg)
3322 {
3323         struct netmsg_hashlookup *rmsg = (struct netmsg_hashlookup *)msg;
3324
3325         if (rmsg->nm_af == AF_INET)
3326                 *rmsg->nm_pinp = in_pcblookup_hash(rmsg->nm_pcbinfo,
3327                     rmsg->nm_saddr->v4, rmsg->nm_sport, rmsg->nm_daddr->v4,
3328                     rmsg->nm_dport, INPLOOKUP_WILDCARD, NULL);
3329 #ifdef INET6
3330         else
3331                 *rmsg->nm_pinp = in6_pcblookup_hash(rmsg->nm_pcbinfo,
3332                     &rmsg->nm_saddr->v6, rmsg->nm_sport, &rmsg->nm_daddr->v6,
3333                     rmsg->nm_dport, INPLOOKUP_WILDCARD, NULL);
3334 #endif /* INET6 */
3335         lwkt_replymsg(&rmsg->base.lmsg, 0);
3336 }
3337 #endif  /* PF_SOCKET_LOOKUP_DOMSG */
3338
3339 int
3340 pf_socket_lookup(int direction, struct pf_pdesc *pd)
3341 {
3342         struct pf_addr          *saddr, *daddr;
3343         u_int16_t                sport, dport;
3344         struct inpcbinfo        *pi;
3345         struct inpcb            *inp;
3346         struct netmsg_hashlookup *msg = NULL;
3347 #ifdef PF_SOCKET_LOOKUP_DOMSG
3348         struct netmsg_hashlookup msg0;
3349 #endif
3350         int                      pi_cpu = 0;
3351
3352         if (pd == NULL)
3353                 return (-1);
3354         pd->lookup.uid = UID_MAX;
3355         pd->lookup.gid = GID_MAX;
3356         pd->lookup.pid = NO_PID;
3357         if (direction == PF_IN) {
3358                 saddr = pd->src;
3359                 daddr = pd->dst;
3360         } else {
3361                 saddr = pd->dst;
3362                 daddr = pd->src;
3363         }
3364         switch (pd->proto) {
3365         case IPPROTO_TCP:
3366                 if (pd->hdr.tcp == NULL)
3367                         return (-1);
3368                 sport = pd->hdr.tcp->th_sport;
3369                 dport = pd->hdr.tcp->th_dport;
3370
3371                 pi_cpu = tcp_addrcpu(saddr->v4.s_addr, sport, daddr->v4.s_addr, dport);
3372                 pi = &tcbinfo[pi_cpu];
3373                 /*
3374                  * Our netstack runs lockless on MP systems
3375                  * (only for TCP connections at the moment).
3376                  *
3377                  * As we are not allowed to read another CPU's tcbinfo,
3378                  * we have to ask that CPU via remote call to search the
3379                  * table for us.
3380                  *
3381                  * Prepare a msg iff data belongs to another CPU.
3382                  */
3383                 if (pi_cpu != mycpu->gd_cpuid) {
3384 #ifdef PF_SOCKET_LOOKUP_DOMSG
3385                         /*
3386                          * NOTE:
3387                          *
3388                          * Following lwkt_domsg() is dangerous and could
3389                          * lockup the network system, e.g.
3390                          *
3391                          * On 2 CPU system:
3392                          * netisr0 domsg to netisr1 (due to lookup)
3393                          * netisr1 domsg to netisr0 (due to lookup)
3394                          *
3395                          * We simply return -1 here, since we are probably
3396                          * called before NAT, so the TCP packet should
3397                          * already be on the correct CPU.
3398                          */
3399                         msg = &msg0;
3400                         netmsg_init(&msg->base, NULL, &curthread->td_msgport,
3401                                     0, in_pcblookup_hash_handler);
3402                         msg->nm_pinp = &inp;
3403                         msg->nm_pcbinfo = pi;
3404                         msg->nm_saddr = saddr;
3405                         msg->nm_sport = sport;
3406                         msg->nm_daddr = daddr;
3407                         msg->nm_dport = dport;
3408                         msg->nm_af = pd->af;
3409 #else   /* !PF_SOCKET_LOOKUP_DOMSG */
3410                         kprintf("pf_socket_lookup: tcp packet not on the "
3411                                 "correct cpu %d, cur cpu %d\n",
3412                                 pi_cpu, mycpuid);
3413                         print_backtrace(-1);
3414                         return -1;
3415 #endif  /* PF_SOCKET_LOOKUP_DOMSG */
3416                 }
3417                 break;
3418         case IPPROTO_UDP:
3419                 if (pd->hdr.udp == NULL)
3420                         return (-1);
3421                 sport = pd->hdr.udp->uh_sport;
3422                 dport = pd->hdr.udp->uh_dport;
3423                 pi = &udbinfo[mycpuid];
3424                 break;
3425         default:
3426                 return (-1);
3427         }
3428         if (direction != PF_IN) {
3429                 u_int16_t       p;
3430
3431                 p = sport;
3432                 sport = dport;
3433                 dport = p;
3434         }
3435         switch (pd->af) {
3436 #ifdef INET6
3437         case AF_INET6:
3438                 /*
3439                  * Query other CPU, second part
3440                  *
3441                  * msg only gets initialized when:
3442                  * 1) packet is TCP
3443                  * 2) the info belongs to another CPU
3444                  *
3445                  * Use some switch/case magic to avoid code duplication.
3446                  */
3447                 if (msg == NULL) {
3448                         inp = in6_pcblookup_hash(pi, &saddr->v6, sport,
3449                             &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL);
3450
3451                         if (inp == NULL)
3452                                 return (-1);
3453                         break;
3454                 }
3455                 /* FALLTHROUGH if SMP and on other CPU */
3456 #endif /* INET6 */
3457         case AF_INET:
3458                 if (msg != NULL) {
3459                         lwkt_domsg(netisr_cpuport(pi_cpu),
3460                                      &msg->base.lmsg, 0);
3461                 } else
3462                 {
3463                         inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4,
3464                             dport, INPLOOKUP_WILDCARD, NULL);
3465                 }
3466                 if (inp == NULL)
3467                         return (-1);
3468                 break;
3469
3470         default:
3471                 return (-1);
3472         }
3473         pd->lookup.uid = inp->inp_socket->so_cred->cr_uid;
3474         pd->lookup.gid = inp->inp_socket->so_cred->cr_groups[0];
3475         return (1);
3476 }
3477
3478 u_int8_t
3479 pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3480 {
3481         int              hlen;
3482         u_int8_t         hdr[60];
3483         u_int8_t        *opt, optlen;
3484         u_int8_t         wscale = 0;
3485
3486         hlen = th_off << 2;             /* hlen <= sizeof(hdr) */
3487         if (hlen <= sizeof(struct tcphdr))
3488                 return (0);
3489         if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3490                 return (0);
3491         opt = hdr + sizeof(struct tcphdr);
3492         hlen -= sizeof(struct tcphdr);
3493         while (hlen >= 3) {
3494                 switch (*opt) {
3495                 case TCPOPT_EOL:
3496                 case TCPOPT_NOP:
3497                         ++opt;
3498                         --hlen;
3499                         break;
3500                 case TCPOPT_WINDOW:
3501                         wscale = opt[2];
3502                         if (wscale > TCP_MAX_WINSHIFT)
3503                                 wscale = TCP_MAX_WINSHIFT;
3504                         wscale |= PF_WSCALE_FLAG;
3505                         /* FALLTHROUGH */
3506                 default:
3507                         optlen = opt[1];
3508                         if (optlen < 2)
3509                                 optlen = 2;
3510                         hlen -= optlen;
3511                         opt += optlen;
3512                         break;
3513                 }
3514         }
3515         return (wscale);
3516 }
3517
3518 u_int16_t
3519 pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3520 {
3521         int              hlen;
3522         u_int8_t         hdr[60];
3523         u_int8_t        *opt, optlen;
3524         u_int16_t        mss = tcp_mssdflt;
3525
3526         hlen = th_off << 2;     /* hlen <= sizeof(hdr) */
3527         if (hlen <= sizeof(struct tcphdr))
3528                 return (0);
3529         if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3530                 return (0);
3531         opt = hdr + sizeof(struct tcphdr);
3532         hlen -= sizeof(struct tcphdr);
3533         while (hlen >= TCPOLEN_MAXSEG) {
3534                 switch (*opt) {
3535                 case TCPOPT_EOL:
3536                 case TCPOPT_NOP:
3537                         ++opt;
3538                         --hlen;
3539                         break;
3540                 case TCPOPT_MAXSEG:
3541                         bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
3542                         /* FALLTHROUGH */
3543                 default:
3544                         optlen = opt[1];
3545                         if (optlen < 2)
3546                                 optlen = 2;
3547                         hlen -= optlen;
3548                         opt += optlen;
3549                         break;
3550                 }
3551         }
3552         return (mss);
3553 }
3554
3555 u_int16_t
3556 pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer)
3557 {
3558 #ifdef INET
3559         struct sockaddr_in      *dst;
3560         struct route             ro;
3561 #endif /* INET */
3562 #ifdef INET6
3563         struct sockaddr_in6     *dst6;
3564         struct route_in6         ro6;
3565 #endif /* INET6 */
3566         struct rtentry          *rt = NULL;
3567         int                      hlen = 0;
3568         u_int16_t                mss = tcp_mssdflt;
3569
3570         switch (af) {
3571 #ifdef INET
3572         case AF_INET:
3573                 hlen = sizeof(struct ip);
3574                 bzero(&ro, sizeof(ro));
3575                 dst = (struct sockaddr_in *)&ro.ro_dst;
3576                 dst->sin_family = AF_INET;
3577                 dst->sin_len = sizeof(*dst);
3578                 dst->sin_addr = addr->v4;
3579                 rtalloc_ign(&ro, (RTF_CLONING | RTF_PRCLONING));
3580                 rt = ro.ro_rt;
3581                 break;
3582 #endif /* INET */
3583 #ifdef INET6
3584         case AF_INET6:
3585                 hlen = sizeof(struct ip6_hdr);
3586                 bzero(&ro6, sizeof(ro6));
3587                 dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
3588                 dst6->sin6_family = AF_INET6;
3589                 dst6->sin6_len = sizeof(*dst6);
3590                 dst6->sin6_addr = addr->v6;
3591                 rtalloc_ign((struct route *)&ro6, (RTF_CLONING | RTF_PRCLONING));
3592                 rt = ro6.ro_rt;
3593                 break;
3594 #endif /* INET6 */
3595         }
3596
3597         if (rt && rt->rt_ifp) {
3598                 mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
3599                 mss = max(tcp_mssdflt, mss);
3600                 RTFREE(rt);
3601         }
3602         mss = min(mss, offer);
3603         mss = max(mss, 64);             /* sanity - at least max opt space */
3604         return (mss);
3605 }
3606
3607 void
3608 pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
3609 {
3610         struct pf_rule *r = s->rule.ptr;
3611
3612         s->rt_kif = NULL;
3613         if (!r->rt || r->rt == PF_FASTROUTE)
3614                 return;
3615         switch (s->key[PF_SK_WIRE]->af) {
3616 #ifdef INET
3617         case AF_INET:
3618                 pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL,
3619                     &s->nat_src_node);
3620                 s->rt_kif = r->rpool.cur->kif;
3621                 break;
3622 #endif /* INET */
3623 #ifdef INET6
3624         case AF_INET6:
3625                 pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL,
3626                     &s->nat_src_node);
3627                 s->rt_kif = r->rpool.cur->kif;
3628                 break;
3629 #endif /* INET6 */
3630         }
3631 }
3632
3633 u_int32_t
3634 pf_tcp_iss(struct pf_pdesc *pd)
3635 {
3636         MD5_CTX ctx;
3637         u_int32_t digest[4];
3638
3639         if (pf_tcp_secret_init == 0) {
3640                 lwkt_gettoken(&pf_gtoken);
3641                 if (pf_tcp_secret_init == 0) {
3642                         karc4rand(pf_tcp_secret, sizeof(pf_tcp_secret));
3643                         MD5Init(&pf_tcp_secret_ctx);
3644                         MD5Update(&pf_tcp_secret_ctx, pf_tcp_secret,
3645                             sizeof(pf_tcp_secret));
3646                         pf_tcp_secret_init = 1;
3647                 }
3648                 lwkt_reltoken(&pf_gtoken);
3649         }
3650         ctx = pf_tcp_secret_ctx;
3651
3652         MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
3653         MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
3654         if (pd->af == AF_INET6) {
3655                 MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
3656                 MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
3657         } else {
3658                 MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
3659                 MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
3660         }
3661         MD5Final((u_char *)digest, &ctx);
3662         pf_tcp_iss_off += 4096;
3663
3664         return (digest[0] + pd->hdr.tcp->th_seq + pf_tcp_iss_off);
3665 }
3666
3667 int
3668 pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
3669     struct pfi_kif *kif, struct mbuf *m, int off, void *h,
3670     struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
3671     struct ifqueue *ifq, struct inpcb *inp)
3672 {
3673         struct pf_rule          *nr = NULL;
3674         struct pf_addr          *saddr = pd->src, *daddr = pd->dst;
3675         sa_family_t              af = pd->af;
3676         struct pf_rule          *r, *a = NULL;
3677         struct pf_ruleset       *ruleset = NULL;
3678         struct pf_src_node      *nsn = NULL;
3679         struct tcphdr           *th = pd->hdr.tcp;
3680         struct pf_state_key     *skw = NULL, *sks = NULL;
3681         struct pf_state_key     *sk = NULL, *nk = NULL;
3682         u_short                  reason;
3683         int                      rewrite = 0, hdrlen = 0;
3684         int                      tag = -1, rtableid = -1;
3685         int                      asd = 0;
3686         int                      match = 0;
3687         int                      state_icmp = 0;
3688         u_int16_t                sport = 0, dport = 0;
3689         u_int16_t                bproto_sum = 0, bip_sum = 0;
3690         u_int8_t                 icmptype = 0, icmpcode = 0;
3691
3692
3693         if (direction == PF_IN && pf_check_congestion(ifq)) {
3694                 REASON_SET(&reason, PFRES_CONGEST);
3695                 return (PF_DROP);
3696         }
3697
3698         if (inp != NULL)
3699                 pd->lookup.done = pf_socket_lookup(direction, pd);
3700         else if (debug_pfugidhack) {
3701                 DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n"));
3702                 pd->lookup.done = pf_socket_lookup(direction, pd);
3703         }
3704
3705         switch (pd->proto) {
3706         case IPPROTO_TCP:
3707                 sport = th->th_sport;
3708                 dport = th->th_dport;
3709                 hdrlen = sizeof(*th);
3710                 break;
3711         case IPPROTO_UDP:
3712                 sport = pd->hdr.udp->uh_sport;
3713                 dport = pd->hdr.udp->uh_dport;
3714                 hdrlen = sizeof(*pd->hdr.udp);
3715                 break;
3716 #ifdef INET
3717         case IPPROTO_ICMP:
3718                 if (pd->af != AF_INET)
3719                         break;
3720                 sport = dport = pd->hdr.icmp->icmp_id;
3721                 hdrlen = sizeof(*pd->hdr.icmp);
3722                 icmptype = pd->hdr.icmp->icmp_type;
3723                 icmpcode = pd->hdr.icmp->icmp_code;
3724
3725                 if (icmptype == ICMP_UNREACH ||
3726                     icmptype == ICMP_SOURCEQUENCH ||
3727                     icmptype == ICMP_REDIRECT ||
3728                     icmptype == ICMP_TIMXCEED ||
3729                     icmptype == ICMP_PARAMPROB)
3730                         state_icmp++;
3731                 break;
3732 #endif /* INET */
3733 #ifdef INET6
3734         case IPPROTO_ICMPV6:
3735                 if (af != AF_INET6)
3736                         break;
3737                 sport = dport = pd->hdr.icmp6->icmp6_id;
3738                 hdrlen = sizeof(*pd->hdr.icmp6);
3739                 icmptype = pd->hdr.icmp6->icmp6_type;
3740                 icmpcode = pd->hdr.icmp6->icmp6_code;
3741
3742                 if (icmptype == ICMP6_DST_UNREACH ||
3743                     icmptype == ICMP6_PACKET_TOO_BIG ||
3744                     icmptype == ICMP6_TIME_EXCEEDED ||
3745                     icmptype == ICMP6_PARAM_PROB)
3746                         state_icmp++;
3747                 break;
3748 #endif /* INET6 */
3749         default:
3750                 sport = dport = hdrlen = 0;
3751                 break;
3752         }
3753
3754         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3755
3756         /* check packet for BINAT/NAT/RDR */
3757         if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn,
3758             &skw, &sks, &sk, &nk, saddr, daddr, sport, dport)) != NULL) {
3759                 if (nk == NULL || sk == NULL) {
3760                         REASON_SET(&reason, PFRES_MEMORY);
3761                         goto cleanup;
3762                 }
3763
3764                 if (pd->ip_sum)
3765                         bip_sum = *pd->ip_sum;
3766
3767                 m->m_flags &= ~M_HASH;
3768                 switch (pd->proto) {
3769                 case IPPROTO_TCP:
3770                         bproto_sum = th->th_sum;
3771                         pd->proto_sum = &th->th_sum;
3772
3773                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3774                             nk->port[pd->sidx] != sport) {
3775                                 pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
3776                                     &th->th_sum, &nk->addr[pd->sidx],
3777                                     nk->port[pd->sidx], 0, af);
3778                                 pd->sport = &th->th_sport;
3779                                 sport = th->th_sport;
3780                         }
3781
3782                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3783                             nk->port[pd->didx] != dport) {
3784                                 pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
3785                                     &th->th_sum, &nk->addr[pd->didx],
3786                                     nk->port[pd->didx], 0, af);
3787                                 dport = th->th_dport;
3788                                 pd->dport = &th->th_dport;
3789                         }
3790                         rewrite++;
3791                         break;
3792                 case IPPROTO_UDP:
3793                         bproto_sum = pd->hdr.udp->uh_sum;
3794                         pd->proto_sum = &pd->hdr.udp->uh_sum;
3795
3796                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3797                             nk->port[pd->sidx] != sport) {
3798                                 pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
3799                                     pd->ip_sum, &pd->hdr.udp->uh_sum,
3800                                     &nk->addr[pd->sidx],
3801                                     nk->port[pd->sidx], 1, af);
3802                                 sport = pd->hdr.udp->uh_sport;
3803                                 pd->sport = &pd->hdr.udp->uh_sport;
3804                         }
3805
3806                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3807                             nk->port[pd->didx] != dport) {
3808                                 pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
3809                                     pd->ip_sum, &pd->hdr.udp->uh_sum,
3810                                     &nk->addr[pd->didx],
3811                                     nk->port[pd->didx], 1, af);
3812                                 dport = pd->hdr.udp->uh_dport;
3813                                 pd->dport = &pd->hdr.udp->uh_dport;
3814                         }
3815                         rewrite++;
3816                         break;
3817 #ifdef INET
3818                 case IPPROTO_ICMP:
3819                         nk->port[0] = nk->port[1];
3820                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3821                                 pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3822                                     nk->addr[pd->sidx].v4.s_addr, 0);
3823
3824                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3825                                 pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3826                                     nk->addr[pd->didx].v4.s_addr, 0);
3827
3828                         if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3829                                 pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3830                                     pd->hdr.icmp->icmp_cksum, sport,
3831                                     nk->port[1], 0);
3832                                 pd->hdr.icmp->icmp_id = nk->port[1];
3833                                 pd->sport = &pd->hdr.icmp->icmp_id;
3834                         }
3835                         m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3836                         break;
3837 #endif /* INET */
3838 #ifdef INET6
3839                 case IPPROTO_ICMPV6:
3840                         nk->port[0] = nk->port[1];
3841                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3842                                 pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3843                                     &nk->addr[pd->sidx], 0);
3844
3845                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3846                                 pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3847                                     &nk->addr[pd->didx], 0);
3848                         rewrite++;
3849                         break;
3850 #endif /* INET */
3851                 default:
3852                         switch (af) {
3853 #ifdef INET
3854                         case AF_INET:
3855                                 if (PF_ANEQ(saddr,
3856                                     &nk->addr[pd->sidx], AF_INET))
3857                                         pf_change_a(&saddr->v4.s_addr,
3858                                             pd->ip_sum,
3859                                             nk->addr[pd->sidx].v4.s_addr, 0);
3860
3861                                 if (PF_ANEQ(daddr,
3862                                     &nk->addr[pd->didx], AF_INET))
3863                                         pf_change_a(&daddr->v4.s_addr,
3864                                             pd->ip_sum,
3865                                             nk->addr[pd->didx].v4.s_addr, 0);
3866                                 break;
3867 #endif /* INET */
3868 #ifdef INET6
3869                         case AF_INET6:
3870                                 if (PF_ANEQ(saddr,
3871                                     &nk->addr[pd->sidx], AF_INET6))
3872                                         PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3873
3874                                 if (PF_ANEQ(daddr,
3875                                     &nk->addr[pd->didx], AF_INET6))
3876                                         PF_ACPY(saddr, &nk->addr[pd->didx], af);
3877                                 break;
3878 #endif /* INET */
3879                         }
3880                         break;
3881                 }
3882                 if (nr->natpass)
3883                         r = NULL;
3884                 pd->nat_rule = nr;
3885         }
3886
3887         while (r != NULL) {
3888                 r->evaluations++;
3889                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3890                         r = r->skip[PF_SKIP_IFP].ptr;
3891                 else if (r->direction && r->direction != direction)
3892                         r = r->skip[PF_SKIP_DIR].ptr;
3893                 else if (r->af && r->af != af)
3894                         r = r->skip[PF_SKIP_AF].ptr;
3895                 else if (r->proto && r->proto != pd->proto)
3896                         r = r->skip[PF_SKIP_PROTO].ptr;
3897                 else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3898                     r->src.neg, kif))
3899                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3900                 /* tcp/udp only. port_op always 0 in other cases */
3901                 else if (r->src.port_op && !pf_match_port(r->src.port_op,
3902                     r->src.port[0], r->src.port[1], sport))
3903                         r = r->skip[PF_SKIP_SRC_PORT].ptr;
3904                 else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3905                     r->dst.neg, NULL))
3906                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
3907                 /* tcp/udp only. port_op always 0 in other cases */
3908                 else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3909                     r->dst.port[0], r->dst.port[1], dport))
3910                         r = r->skip[PF_SKIP_DST_PORT].ptr;
3911                 /* icmp only. type always 0 in other cases */
3912                 else if (r->type && r->type != icmptype + 1)
3913                         r = TAILQ_NEXT(r, entries);
3914                 /* icmp only. type always 0 in other cases */
3915                 else if (r->code && r->code != icmpcode + 1)
3916                         r = TAILQ_NEXT(r, entries);
3917                 else if (r->tos && !(r->tos == pd->tos))
3918                         r = TAILQ_NEXT(r, entries);
3919                 else if (r->rule_flag & PFRULE_FRAGMENT)
3920                         r = TAILQ_NEXT(r, entries);
3921                 else if (pd->proto == IPPROTO_TCP &&
3922                     (r->flagset & th->th_flags) != r->flags)
3923                         r = TAILQ_NEXT(r, entries);
3924                 /* tcp/udp only. uid.op always 0 in other cases */
3925                 else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3926                     pf_socket_lookup(direction, pd), 1)) &&
3927                     !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3928                     pd->lookup.uid))
3929                         r = TAILQ_NEXT(r, entries);
3930                 /* tcp/udp only. gid.op always 0 in other cases */
3931                 else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3932                     pf_socket_lookup(direction, pd), 1)) &&
3933                     !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3934                     pd->lookup.gid))
3935                         r = TAILQ_NEXT(r, entries);
3936                 else if (r->prob &&
3937                   r->prob <= karc4random())
3938                         r = TAILQ_NEXT(r, entries);
3939                 else if (r->match_tag && !pf_match_tag(m, r, &tag))
3940                         r = TAILQ_NEXT(r, entries);
3941                 else if (r->os_fingerprint != PF_OSFP_ANY &&
3942                     (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3943                     pf_osfp_fingerprint(pd, m, off, th),
3944                     r->os_fingerprint)))
3945                         r = TAILQ_NEXT(r, entries);
3946                 else {
3947                         if (r->tag)
3948                                 tag = r->tag;
3949                         if (r->rtableid >= 0)
3950                                 rtableid = r->rtableid;
3951                         if (r->anchor == NULL) {
3952                                 match = 1;
3953                                 *rm = r;
3954                                 *am = a;
3955                                 *rsm = ruleset;
3956                                 if ((*rm)->quick)
3957                                         break;
3958                                 r = TAILQ_NEXT(r, entries);
3959                         } else
3960                                 pf_step_into_anchor(&asd, &ruleset,
3961                                     PF_RULESET_FILTER, &r, &a, &match);
3962                 }
3963                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
3964                     PF_RULESET_FILTER, &r, &a, &match))
3965                         break;
3966         }
3967         r = *rm;
3968         a = *am;
3969         ruleset = *rsm;
3970
3971         REASON_SET(&reason, PFRES_MATCH);
3972
3973         if (r->log || (nr != NULL && nr->log)) {
3974                 if (rewrite)
3975                         m_copyback(m, off, hdrlen, pd->hdr.any);
3976                 PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
3977                     a, ruleset, pd);
3978         }
3979
3980         if ((r->action == PF_DROP) &&
3981             ((r->rule_flag & PFRULE_RETURNRST) ||
3982             (r->rule_flag & PFRULE_RETURNICMP) ||
3983             (r->rule_flag & PFRULE_RETURN))) {
3984                 /* undo NAT changes, if they have taken place */
3985                 if (nr != NULL) {
3986                         PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3987                         PF_ACPY(daddr, &sk->addr[pd->didx], af);
3988                         if (pd->sport)
3989                                 *pd->sport = sk->port[pd->sidx];
3990                         if (pd->dport)
3991                                 *pd->dport = sk->port[pd->didx];
3992                         if (pd->proto_sum)
3993                                 *pd->proto_sum = bproto_sum;
3994                         if (pd->ip_sum)
3995                                 *pd->ip_sum = bip_sum;
3996                         m_copyback(m, off, hdrlen, pd->hdr.any);
3997                 }
3998                 if (pd->proto == IPPROTO_TCP &&
3999                     ((r->rule_flag & PFRULE_RETURNRST) ||
4000                     (r->rule_flag & PFRULE_RETURN)) &&
4001                     !(th->th_flags & TH_RST)) {
4002                         u_int32_t        ack = ntohl(th->th_seq) + pd->p_len;
4003                         int              len = 0;
4004                         struct ip       *h4;
4005 #ifdef INET6
4006                         struct ip6_hdr  *h6;
4007 #endif
4008                         switch (af) {
4009                         case AF_INET:
4010                                 h4 = mtod(m, struct ip *);
4011                                 len = h4->ip_len - off;
4012                                 break;
4013 #ifdef INET6
4014                         case AF_INET6:
4015                                 h6 = mtod(m, struct ip6_hdr *);
4016                                 len = h6->ip6_plen - (off - sizeof(*h6));
4017                                 break;
4018 #endif
4019                         }
4020
4021                         if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
4022                                 REASON_SET(&reason, PFRES_PROTCKSUM);
4023                         else {
4024                                 if (th->th_flags & TH_SYN)
4025                                         ack++;
4026                                 if (th->th_flags & TH_FIN)
4027                                         ack++;
4028                                 pf_send_tcp(r, af, pd->dst,
4029                                     pd->src, th->th_dport, th->th_sport,
4030                                     ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
4031                                     r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp);
4032                         }
4033                 } else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
4034                     r->return_icmp)
4035                         pf_send_icmp(m, r->return_icmp >> 8,
4036                             r->return_icmp & 255, af, r);
4037                 else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
4038                     r->return_icmp6)
4039                         pf_send_icmp(m, r->return_icmp6 >> 8,
4040                             r->return_icmp6 & 255, af, r);
4041         }
4042
4043         if (r->action == PF_DROP)
4044                 goto cleanup;
4045
4046         if (pf_tag_packet(m, tag, rtableid)) {
4047                 REASON_SET(&reason, PFRES_MEMORY);
4048                 goto cleanup;
4049         }
4050
4051         if (!state_icmp && (r->keep_state || nr != NULL ||
4052             (pd->flags & PFDESC_TCP_NORM))) {
4053                 int action;
4054                 action = pf_create_state(r, nr, a, pd, nsn, skw, sks, nk, sk, m,
4055                     off, sport, dport, &rewrite, kif, sm, tag, bproto_sum,
4056                     bip_sum, hdrlen);
4057                 if (action != PF_PASS)
4058                         return (action);
4059         }
4060
4061         /* copy back packet headers if we performed NAT operations */
4062         if (rewrite)
4063                 m_copyback(m, off, hdrlen, pd->hdr.any);
4064
4065         return (PF_PASS);
4066
4067 cleanup:
4068         if (sk != NULL)
4069                 kfree(sk, M_PFSTATEKEYPL);
4070         if (nk != NULL)
4071                 kfree(nk, M_PFSTATEKEYPL);
4072         return (PF_DROP);
4073 }
4074
4075 static __inline int
4076 pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
4077     struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *skw,
4078     struct pf_state_key *sks, struct pf_state_key *nk, struct pf_state_key *sk,
4079     struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite,
4080     struct pfi_kif *kif, struct pf_state **sm, int tag, u_int16_t bproto_sum,
4081     u_int16_t bip_sum, int hdrlen)
4082 {
4083         struct pf_state         *s = NULL;
4084         struct pf_src_node      *sn = NULL;
4085         struct tcphdr           *th = pd->hdr.tcp;
4086         u_int16_t                mss = tcp_mssdflt;
4087         u_short                  reason;
4088         int cpu = mycpu->gd_cpuid;
4089
4090         /* check maximums */
4091         if (r->max_states && (r->states_cur >= r->max_states)) {
4092                 pf_status.lcounters[LCNT_STATES]++;
4093                 REASON_SET(&reason, PFRES_MAXSTATES);
4094                 return (PF_DROP);
4095         }
4096         /* src node for filter rule */
4097         if ((r->rule_flag & PFRULE_SRCTRACK ||
4098             r->rpool.opts & PF_POOL_STICKYADDR) &&
4099             pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
4100                 REASON_SET(&reason, PFRES_SRCLIMIT);
4101                 goto csfailed;
4102         }
4103         /* src node for translation rule */
4104         if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
4105             pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
4106                 REASON_SET(&reason, PFRES_SRCLIMIT);
4107                 goto csfailed;
4108         }
4109         s = kmalloc(sizeof(struct pf_state), M_PFSTATEPL, M_NOWAIT|M_ZERO);
4110         if (s == NULL) {
4111                 REASON_SET(&reason, PFRES_MEMORY);
4112                 goto csfailed;
4113         }
4114         lockinit(&s->lk, "pfstlk", 0, 0);
4115         s->id = 0; /* XXX Do we really need that? not in OpenBSD */
4116         s->creatorid = 0;
4117         s->rule.ptr = r;
4118         s->nat_rule.ptr = nr;
4119         s->anchor.ptr = a;
4120         s->state_flags = PFSTATE_CREATEINPROG;
4121         STATE_INC_COUNTERS(s);
4122         if (r->allow_opts)
4123                 s->state_flags |= PFSTATE_ALLOWOPTS;
4124         if (r->rule_flag & PFRULE_STATESLOPPY)
4125                 s->state_flags |= PFSTATE_SLOPPY;
4126         if (pd->not_cpu_localized)
4127                 s->state_flags |= PFSTATE_STACK_GLOBAL;
4128
4129         s->log = r->log & PF_LOG_ALL;
4130         if (nr != NULL)
4131                 s->log |= nr->log & PF_LOG_ALL;
4132         switch (pd->proto) {
4133         case IPPROTO_TCP:
4134                 s->src.seqlo = ntohl(th->th_seq);
4135                 s->src.seqhi = s->src.seqlo + pd->p_len + 1;
4136                 if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
4137                     r->keep_state == PF_STATE_MODULATE) {
4138                         /* Generate sequence number modulator */
4139                         if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
4140                             0)
4141                                 s->src.seqdiff = 1;
4142                         pf_change_a(&th->th_seq, &th->th_sum,
4143                             htonl(s->src.seqlo + s->src.seqdiff), 0);
4144                         *rewrite = 1;
4145                 } else
4146                         s->src.seqdiff = 0;
4147                 if (th->th_flags & TH_SYN) {
4148                         s->src.seqhi++;
4149                         s->src.wscale = pf_get_wscale(m, off,
4150                             th->th_off, pd->af);
4151                 }
4152                 s->src.max_win = MAX(ntohs(th->th_win), 1);
4153                 if (s->src.wscale & PF_WSCALE_MASK) {
4154                         /* Remove scale factor from initial window */
4155                         int win = s->src.max_win;
4156                         win += 1 << (s->src.wscale & PF_WSCALE_MASK);
4157                         s->src.max_win = (win - 1) >>
4158                             (s->src.wscale & PF_WSCALE_MASK);
4159                 }
4160                 if (th->th_flags & TH_FIN)
4161                         s->src.seqhi++;
4162                 s->dst.seqhi = 1;
4163                 s->dst.max_win = 1;
4164                 s->src.state = TCPS_SYN_SENT;
4165                 s->dst.state = TCPS_CLOSED;
4166                 s->timeout = PFTM_TCP_FIRST_PACKET;
4167                 break;
4168         case IPPROTO_UDP:
4169                 s->src.state = PFUDPS_SINGLE;
4170                 s->dst.state = PFUDPS_NO_TRAFFIC;
4171                 s->timeout = PFTM_UDP_FIRST_PACKET;
4172                 break;
4173         case IPPROTO_ICMP:
4174 #ifdef INET6
4175         case IPPROTO_ICMPV6:
4176 #endif
4177                 s->timeout = PFTM_ICMP_FIRST_PACKET;
4178                 break;
4179         default:
4180                 s->src.state = PFOTHERS_SINGLE;
4181                 s->dst.state = PFOTHERS_NO_TRAFFIC;
4182                 s->timeout = PFTM_OTHER_FIRST_PACKET;
4183         }
4184
4185         s->creation = time_second;
4186         s->expire = time_second;
4187
4188         if (sn != NULL) {
4189                 s->src_node = sn;
4190                 s->src_node->states++;
4191         }
4192         if (nsn != NULL) {
4193                 /* XXX We only modify one side for now. */
4194                 PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
4195                 s->nat_src_node = nsn;
4196                 s->nat_src_node->states++;
4197         }
4198         if (pd->proto == IPPROTO_TCP) {
4199                 if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
4200                     off, pd, th, &s->src, &s->dst)) {
4201                         REASON_SET(&reason, PFRES_MEMORY);
4202                         pf_src_tree_remove_state(s);
4203                         STATE_DEC_COUNTERS(s);
4204                         kfree(s, M_PFSTATEPL);
4205                         return (PF_DROP);
4206                 }
4207                 if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
4208                     pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
4209                     &s->src, &s->dst, rewrite)) {
4210                         /* This really shouldn't happen!!! */
4211                         DPFPRINTF(PF_DEBUG_URGENT,
4212                             ("pf_normalize_tcp_stateful failed on first pkt"));
4213                         pf_normalize_tcp_cleanup(s);
4214                         pf_src_tree_remove_state(s);
4215                         STATE_DEC_COUNTERS(s);
4216                         kfree(s, M_PFSTATEPL);
4217                         return (PF_DROP);
4218                 }
4219         }
4220         s->direction = pd->dir;
4221
4222         if (sk == NULL && pf_state_key_setup(pd, nr, &skw, &sks, &sk, &nk,
4223                                              pd->src, pd->dst, sport, dport)) {
4224                 REASON_SET(&reason, PFRES_MEMORY);
4225                 goto csfailed;
4226         }
4227
4228         if (pf_state_insert(BOUND_IFACE(r, kif), skw, sks, s)) {
4229                 if (pd->proto == IPPROTO_TCP)
4230                         pf_normalize_tcp_cleanup(s);
4231                 REASON_SET(&reason, PFRES_STATEINS);
4232                 pf_src_tree_remove_state(s);
4233                 STATE_DEC_COUNTERS(s);
4234                 kfree(s, M_PFSTATEPL);
4235                 return (PF_DROP);
4236         } else
4237                 *sm = s;
4238
4239         pf_set_rt_ifp(s, pd->src);      /* needs s->state_key set */
4240         if (tag > 0) {
4241                 pf_tag_ref(tag);
4242                 s->tag = tag;
4243         }
4244         if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
4245             TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
4246                 s->src.state = PF_TCPS_PROXY_SRC;
4247                 /* undo NAT changes, if they have taken place */
4248                 if (nr != NULL) {
4249                         struct pf_state_key *skt = s->key[PF_SK_WIRE];
4250                         if (pd->dir == PF_OUT)
4251                                 skt = s->key[PF_SK_STACK];
4252                         PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
4253                         PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
4254                         if (pd->sport)
4255                                 *pd->sport = skt->port[pd->sidx];
4256                         if (pd->dport)
4257                                 *pd->dport = skt->port[pd->didx];
4258                         if (pd->proto_sum)
4259                                 *pd->proto_sum = bproto_sum;
4260                         if (pd->ip_sum)
4261                                 *pd->ip_sum = bip_sum;
4262                         m->m_flags &= ~M_HASH;
4263                         m_copyback(m, off, hdrlen, pd->hdr.any);
4264                 }
4265                 s->src.seqhi = htonl(karc4random());
4266                 /* Find mss option */
4267                 mss = pf_get_mss(m, off, th->th_off, pd->af);
4268                 mss = pf_calc_mss(pd->src, pd->af, mss);
4269                 mss = pf_calc_mss(pd->dst, pd->af, mss);
4270                 s->src.mss = mss;
4271                 s->state_flags &= ~PFSTATE_CREATEINPROG;
4272                 pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport,
4273                             th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
4274                             TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL, NULL);
4275                 REASON_SET(&reason, PFRES_SYNPROXY);
4276                 return (PF_SYNPROXY_DROP);
4277         }
4278
4279         s->state_flags &= ~PFSTATE_CREATEINPROG;
4280         return (PF_PASS);
4281
4282 csfailed:
4283         if (sk != NULL)
4284                 kfree(sk, M_PFSTATEKEYPL);
4285         if (nk != NULL)
4286                 kfree(nk, M_PFSTATEKEYPL);
4287
4288         if (sn != NULL && sn->states == 0 && sn->expire == 0) {
4289                 RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], sn);
4290                 pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
4291                 atomic_add_int(&pf_status.src_nodes, -1);
4292                 kfree(sn, M_PFSRCTREEPL);
4293         }
4294         if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
4295                 RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], nsn);
4296                 pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
4297                 atomic_add_int(&pf_status.src_nodes, -1);
4298                 kfree(nsn, M_PFSRCTREEPL);
4299         }
4300         if (s) {
4301                 pf_src_tree_remove_state(s);
4302                 STATE_DEC_COUNTERS(s);
4303                 kfree(s, M_PFSTATEPL);
4304         }
4305
4306         return (PF_DROP);
4307 }
4308
4309 int
4310 pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
4311     struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
4312     struct pf_ruleset **rsm)
4313 {
4314         struct pf_rule          *r, *a = NULL;
4315         struct pf_ruleset       *ruleset = NULL;
4316         sa_family_t              af = pd->af;
4317         u_short                  reason;
4318         int                      tag = -1;
4319         int                      asd = 0;
4320         int                      match = 0;
4321
4322         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
4323         while (r != NULL) {
4324                 r->evaluations++;
4325                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
4326                         r = r->skip[PF_SKIP_IFP].ptr;
4327                 else if (r->direction && r->direction != direction)
4328                         r = r->skip[PF_SKIP_DIR].ptr;
4329                 else if (r->af && r->af != af)
4330                         r = r->skip[PF_SKIP_AF].ptr;
4331                 else if (r->proto && r->proto != pd->proto)
4332                         r = r->skip[PF_SKIP_PROTO].ptr;
4333                 else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
4334                     r->src.neg, kif))
4335                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
4336                 else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
4337                     r->dst.neg, NULL))
4338                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
4339                 else if (r->tos && !(r->tos == pd->tos))
4340                         r = TAILQ_NEXT(r, entries);
4341                 else if (r->os_fingerprint != PF_OSFP_ANY)
4342                         r = TAILQ_NEXT(r, entries);
4343                 else if (pd->proto == IPPROTO_UDP &&
4344                     (r->src.port_op || r->dst.port_op))
4345                         r = TAILQ_NEXT(r, entries);
4346                 else if (pd->proto == IPPROTO_TCP &&
4347                     (r->src.port_op || r->dst.port_op || r->flagset))
4348                         r = TAILQ_NEXT(r, entries);
4349                 else if ((pd->proto == IPPROTO_ICMP ||
4350                     pd->proto == IPPROTO_ICMPV6) &&
4351                     (r->type || r->code))
4352                         r = TAILQ_NEXT(r, entries);
4353                 else if (r->prob && r->prob <= karc4random())
4354                         r = TAILQ_NEXT(r, entries);
4355                 else if (r->match_tag && !pf_match_tag(m, r, &tag))
4356                         r = TAILQ_NEXT(r, entries);
4357                 else {
4358                         if (r->anchor == NULL) {
4359                                 match = 1;
4360                                 *rm = r;
4361                                 *am = a;
4362                                 *rsm = ruleset;
4363                                 if ((*rm)->quick)
4364                                         break;
4365                                 r = TAILQ_NEXT(r, entries);
4366                         } else
4367                                 pf_step_into_anchor(&asd, &ruleset,
4368                                     PF_RULESET_FILTER, &r, &a, &match);
4369                 }
4370                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
4371                     PF_RULESET_FILTER, &r, &a, &match))
4372                         break;
4373         }
4374         r = *rm;
4375         a = *am;
4376         ruleset = *rsm;
4377
4378         REASON_SET(&reason, PFRES_MATCH);
4379
4380         if (r->log)
4381                 PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset,
4382                     pd);
4383
4384         if (r->action != PF_PASS)
4385                 return (PF_DROP);
4386
4387         if (pf_tag_packet(m, tag, -1)) {
4388                 REASON_SET(&reason, PFRES_MEMORY);
4389                 return (PF_DROP);
4390         }
4391
4392         return (PF_PASS);
4393 }
4394
4395 /*
4396  * Called with state locked
4397  */
4398 int
4399 pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
4400         struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
4401         struct pf_pdesc *pd, u_short *reason, int *copyback)
4402 {
4403         struct tcphdr           *th = pd->hdr.tcp;
4404         u_int16_t                win = ntohs(th->th_win);
4405         u_int32_t                ack, end, seq, orig_seq;
4406         u_int8_t                 sws, dws;
4407         int                      ackskew;
4408
4409         if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
4410                 sws = src->wscale & PF_WSCALE_MASK;
4411                 dws = dst->wscale & PF_WSCALE_MASK;
4412         } else {
4413                 sws = dws = 0;
4414         }
4415
4416         /*
4417          * Sequence tracking algorithm from Guido van Rooij's paper:
4418          *   http://www.madison-gurkha.com/publications/tcp_filtering/
4419          *      tcp_filtering.ps
4420          */
4421
4422         orig_seq = seq = ntohl(th->th_seq);
4423         if (src->seqlo == 0) {
4424                 /* First packet from this end. Set its state */
4425
4426                 if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
4427                     src->scrub == NULL) {
4428                         if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
4429                                 REASON_SET(reason, PFRES_MEMORY);
4430                                 return (PF_DROP);
4431                         }
4432                 }
4433
4434                 /* Deferred generation of sequence number modulator */
4435                 if (dst->seqdiff && !src->seqdiff) {
4436                         /* use random iss for the TCP server */
4437                         while ((src->seqdiff = karc4random() - seq) == 0)
4438                                 ;
4439                         ack = ntohl(th->th_ack) - dst->seqdiff;
4440                         pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
4441                             src->seqdiff), 0);
4442                         pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
4443                         *copyback = 1;
4444                 } else {
4445                         ack = ntohl(th->th_ack);
4446                 }
4447
4448                 end = seq + pd->p_len;
4449                 if (th->th_flags & TH_SYN) {
4450                         end++;
4451                         (*state)->sync_flags |= PFSTATE_GOT_SYN2;
4452                         if (dst->wscale & PF_WSCALE_FLAG) {
4453                                 src->wscale = pf_get_wscale(m, off, th->th_off,
4454                                     pd->af);
4455                                 if (src->wscale & PF_WSCALE_FLAG) {
4456                                         /* Remove scale factor from initial
4457                                          * window */
4458                                         sws = src->wscale & PF_WSCALE_MASK;
4459                                         win = ((u_int32_t)win + (1 << sws) - 1)
4460                                             >> sws;
4461                                         dws = dst->wscale & PF_WSCALE_MASK;
4462                                 } else {
4463                                         /* fixup other window */
4464                                         dst->max_win <<= dst->wscale &
4465                                             PF_WSCALE_MASK;
4466                                         /* in case of a retrans SYN|ACK */
4467                                         dst->wscale = 0;
4468                                 }
4469                         }
4470                 }
4471                 if (th->th_flags & TH_FIN)
4472                         end++;
4473
4474                 src->seqlo = seq;
4475                 if (src->state < TCPS_SYN_SENT)
4476                         src->state = TCPS_SYN_SENT;
4477
4478                 /*
4479                  * May need to slide the window (seqhi may have been set by
4480                  * the crappy stack check or if we picked up the connection
4481                  * after establishment)
4482                  */
4483                 if (src->seqhi == 1 ||
4484                     SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
4485                         src->seqhi = end + MAX(1, dst->max_win << dws);
4486                 if (win > src->max_win)
4487                         src->max_win = win;
4488
4489         } else {
4490                 ack = ntohl(th->th_ack) - dst->seqdiff;
4491                 if (src->seqdiff) {
4492                         /* Modulate sequence numbers */
4493                         pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
4494                             src->seqdiff), 0);
4495                         pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
4496                         *copyback = 1;
4497                 }
4498                 end = seq + pd->p_len;
4499                 if (th->th_flags & TH_SYN)
4500                         end++;
4501                 if (th->th_flags & TH_FIN)
4502                         end++;
4503         }
4504
4505         if ((th->th_flags & TH_ACK) == 0) {
4506                 /* Let it pass through the ack skew check */
4507                 ack = dst->seqlo;
4508         } else if ((ack == 0 &&
4509             (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
4510             /* broken tcp stacks do not set ack */
4511             (dst->state < TCPS_SYN_SENT)) {
4512                 /*
4513                  * Many stacks (ours included) will set the ACK number in an
4514                  * FIN|ACK if the SYN times out -- no sequence to ACK.
4515                  */
4516                 ack = dst->seqlo;
4517         }
4518
4519         if (seq == end) {
4520                 /* Ease sequencing restrictions on no data packets */
4521                 seq = src->seqlo;
4522                 end = seq;
4523         }
4524
4525         ackskew = dst->seqlo - ack;
4526
4527
4528         /*
4529          * Need to demodulate the sequence numbers in any TCP SACK options
4530          * (Selective ACK). We could optionally validate the SACK values
4531          * against the current ACK window, either forwards or backwards, but
4532          * I'm not confident that SACK has been implemented properly
4533          * everywhere. It wouldn't surprise me if several stacks accidently
4534          * SACK too far backwards of previously ACKed data. There really aren't
4535          * any security implications of bad SACKing unless the target stack
4536          * doesn't validate the option length correctly. Someone trying to
4537          * spoof into a TCP connection won't bother blindly sending SACK
4538          * options anyway.
4539          */
4540         if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
4541                 if (pf_modulate_sack(m, off, pd, th, dst))
4542                         *copyback = 1;
4543         }
4544
4545
4546 #define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
4547         if (SEQ_GEQ(src->seqhi, end) &&
4548             /* Last octet inside other's window space */
4549             SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
4550             /* Retrans: not more than one window back */
4551             (ackskew >= -MAXACKWINDOW) &&
4552             /* Acking not more than one reassembled fragment backwards */
4553             (ackskew <= (MAXACKWINDOW << sws)) &&
4554             /* Acking not more than one window forward */
4555             ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
4556             (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
4557             (pd->flags & PFDESC_IP_REAS) == 0)) {
4558             /* Require an exact/+1 sequence match on resets when possible */
4559
4560                 if (dst->scrub || src->scrub) {
4561                         if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4562                             *state, src, dst, copyback))
4563                                 return (PF_DROP);
4564                 }
4565
4566                 /* update max window */
4567                 if (src->max_win < win)
4568                         src->max_win = win;
4569                 /* synchronize sequencing */
4570                 if (SEQ_GT(end, src->seqlo))
4571                         src->seqlo = end;
4572                 /* slide the window of what the other end can send */
4573                 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4574                         dst->seqhi = ack + MAX((win << sws), 1);
4575
4576
4577                 /* update states */
4578                 if (th->th_flags & TH_SYN)
4579                         if (src->state < TCPS_SYN_SENT)
4580                                 src->state = TCPS_SYN_SENT;
4581                 if (th->th_flags & TH_FIN)
4582                         if (src->state < TCPS_CLOSING)
4583                                 src->state = TCPS_CLOSING;
4584                 if (th->th_flags & TH_ACK) {
4585                         if (dst->state == TCPS_SYN_SENT) {
4586                                 dst->state = TCPS_ESTABLISHED;
4587                                 if (src->state == TCPS_ESTABLISHED &&
4588                                     (*state)->src_node != NULL &&
4589                                     pf_src_connlimit(*state)) {
4590                                         REASON_SET(reason, PFRES_SRCLIMIT);
4591                                         return (PF_DROP);
4592                                 }
4593                         } else if (dst->state == TCPS_CLOSING)
4594                                 dst->state = TCPS_FIN_WAIT_2;
4595                 }
4596                 if (th->th_flags & TH_RST)
4597                         src->state = dst->state = TCPS_TIME_WAIT;
4598
4599                 /* update expire time */
4600                 (*state)->expire = time_second;
4601                 if (src->state >= TCPS_FIN_WAIT_2 &&
4602                     dst->state >= TCPS_FIN_WAIT_2)
4603                         (*state)->timeout = PFTM_TCP_CLOSED;
4604                 else if (src->state >= TCPS_CLOSING &&
4605                     dst->state >= TCPS_CLOSING)
4606                         (*state)->timeout = PFTM_TCP_FIN_WAIT;
4607                 else if (src->state < TCPS_ESTABLISHED ||
4608                     dst->state < TCPS_ESTABLISHED)
4609                         (*state)->timeout = PFTM_TCP_OPENING;
4610                 else if (src->state >= TCPS_CLOSING ||
4611                     dst->state >= TCPS_CLOSING)
4612                         (*state)->timeout = PFTM_TCP_CLOSING;
4613                 else
4614                         (*state)->timeout = PFTM_TCP_ESTABLISHED;
4615
4616                 /* Fall through to PASS packet */
4617
4618         } else if ((dst->state < TCPS_SYN_SENT ||
4619                 dst->state >= TCPS_FIN_WAIT_2 ||
4620                 src->state >= TCPS_FIN_WAIT_2) &&
4621             SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
4622             /* Within a window forward of the originating packet */
4623             SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
4624             /* Within a window backward of the originating packet */
4625
4626                 /*
4627                  * This currently handles three situations:
4628                  *  1) Stupid stacks will shotgun SYNs before their peer
4629                  *     replies.
4630                  *  2) When PF catches an already established stream (the
4631                  *     firewall rebooted, the state table was flushed, routes
4632                  *     changed...)
4633                  *  3) Packets get funky immediately after the connection
4634                  *     closes (this should catch Solaris spurious ACK|FINs
4635                  *     that web servers like to spew after a close)
4636                  *
4637                  * This must be a little more careful than the above code
4638                  * since packet floods will also be caught here. We don't
4639                  * update the TTL here to mitigate the damage of a packet
4640                  * flood and so the same code can handle awkward establishment
4641                  * and a loosened connection close.
4642                  * In the establishment case, a correct peer response will
4643                  * validate the connection, go through the normal state code
4644                  * and keep updating the state TTL.
4645                  */
4646
4647                 if (pf_status.debug >= PF_DEBUG_MISC) {
4648                         kprintf("pf: loose state match: ");
4649                         pf_print_state(*state);
4650                         pf_print_flags(th->th_flags);
4651                         kprintf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4652                             "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len,
4653                             ackskew, (unsigned long long)(*state)->packets[0],
4654                             (unsigned long long)(*state)->packets[1],
4655                             pd->dir == PF_IN ? "in" : "out",
4656                             pd->dir == (*state)->direction ? "fwd" : "rev");
4657                 }
4658
4659                 if (dst->scrub || src->scrub) {
4660                         if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4661                             *state, src, dst, copyback))
4662                                 return (PF_DROP);
4663                 }
4664
4665                 /* update max window */
4666                 if (src->max_win < win)
4667                         src->max_win = win;
4668                 /* synchronize sequencing */
4669                 if (SEQ_GT(end, src->seqlo))
4670                         src->seqlo = end;
4671                 /* slide the window of what the other end can send */
4672                 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4673                         dst->seqhi = ack + MAX((win << sws), 1);
4674
4675                 /*
4676                  * Cannot set dst->seqhi here since this could be a shotgunned
4677                  * SYN and not an already established connection.
4678                  */
4679
4680                 if (th->th_flags & TH_FIN)
4681                         if (src->state < TCPS_CLOSING)
4682                                 src->state = TCPS_CLOSING;
4683                 if (th->th_flags & TH_RST)
4684                         src->state = dst->state = TCPS_TIME_WAIT;
4685
4686                 /* Fall through to PASS packet */
4687
4688         } else if ((*state)->pickup_mode == PF_PICKUPS_HASHONLY ||
4689                     ((*state)->pickup_mode == PF_PICKUPS_ENABLED &&
4690                      ((*state)->sync_flags & PFSTATE_GOT_SYN_MASK) !=
4691                       PFSTATE_GOT_SYN_MASK)) {
4692                 /*
4693                  * If pickup mode is hash only, do not fail on sequence checks.
4694                  *
4695                  * If pickup mode is enabled and we did not see the SYN in
4696                  * both direction, do not fail on sequence checks because
4697                  * we do not have complete information on window scale.
4698                  *
4699                  * Adjust expiration and fall through to PASS packet.
4700                  * XXX Add a FIN check to reduce timeout?
4701                  */
4702                 (*state)->expire = time_second;
4703         } else  {
4704                 /*
4705                  * Failure processing
4706                  */
4707                 if ((*state)->dst.state == TCPS_SYN_SENT &&
4708                     (*state)->src.state == TCPS_SYN_SENT) {
4709                         /* Send RST for state mismatches during handshake */
4710                         if (!(th->th_flags & TH_RST))
4711                                 pf_send_tcp((*state)->rule.ptr, pd->af,
4712                                     pd->dst, pd->src, th->th_dport,
4713                                     th->th_sport, ntohl(th->th_ack), 0,
4714                                     TH_RST, 0, 0,
4715                                     (*state)->rule.ptr->return_ttl, 1, 0,
4716                                     pd->eh, kif->pfik_ifp);
4717                         src->seqlo = 0;
4718                         src->seqhi = 1;
4719                         src->max_win = 1;
4720                 } else if (pf_status.debug >= PF_DEBUG_MISC) {
4721                         kprintf("pf: BAD state: ");
4722                         pf_print_state(*state);
4723                         pf_print_flags(th->th_flags);
4724                         kprintf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4725                             "pkts=%llu:%llu dir=%s,%s\n",
4726                             seq, orig_seq, ack, pd->p_len, ackskew,
4727                             (unsigned long long)(*state)->packets[0],
4728                                 (unsigned long long)(*state)->packets[1],
4729                             pd->dir == PF_IN ? "in" : "out",
4730                             pd->dir == (*state)->direction ? "fwd" : "rev");
4731                         kprintf("pf: State failure on: %c %c %c %c | %c %c\n",
4732                             SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
4733                             SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
4734                             ' ': '2',
4735                             (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
4736                             (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
4737                             SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
4738                             SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
4739                 }
4740                 REASON_SET(reason, PFRES_BADSTATE);
4741                 return (PF_DROP);
4742         }
4743
4744         return (PF_PASS);
4745 }
4746
4747 /*
4748  * Called with state locked
4749  */
4750 int
4751 pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
4752         struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
4753 {
4754         struct tcphdr           *th = pd->hdr.tcp;
4755
4756         if (th->th_flags & TH_SYN)
4757                 if (src->state < TCPS_SYN_SENT)
4758                         src->state = TCPS_SYN_SENT;
4759         if (th->th_flags & TH_FIN)
4760                 if (src->state < TCPS_CLOSING)
4761                         src->state = TCPS_CLOSING;
4762         if (th->th_flags & TH_ACK) {
4763                 if (dst->state == TCPS_SYN_SENT) {
4764                         dst->state = TCPS_ESTABLISHED;
4765                         if (src->state == TCPS_ESTABLISHED &&
4766                             (*state)->src_node != NULL &&
4767                             pf_src_connlimit(*state)) {
4768                                 REASON_SET(reason, PFRES_SRCLIMIT);
4769                                 return (PF_DROP);
4770                         }
4771                 } else if (dst->state == TCPS_CLOSING) {
4772                         dst->state = TCPS_FIN_WAIT_2;
4773                 } else if (src->state == TCPS_SYN_SENT &&
4774                     dst->state < TCPS_SYN_SENT) {
4775                         /*
4776                          * Handle a special sloppy case where we only see one
4777                          * half of the connection. If there is a ACK after
4778                          * the initial SYN without ever seeing a packet from
4779                          * the destination, set the connection to established.
4780                          */
4781                         dst->state = src->state = TCPS_ESTABLISHED;
4782                         if ((*state)->src_node != NULL &&
4783                             pf_src_connlimit(*state)) {
4784                                 REASON_SET(reason, PFRES_SRCLIMIT);
4785                                 return (PF_DROP);
4786                         }
4787                 } else if (src->state == TCPS_CLOSING &&
4788                     dst->state == TCPS_ESTABLISHED &&
4789                     dst->seqlo == 0) {
4790                         /*
4791                          * Handle the closing of half connections where we
4792                          * don't see the full bidirectional FIN/ACK+ACK
4793                          * handshake.
4794                          */
4795                         dst->state = TCPS_CLOSING;
4796                 }
4797         }
4798         if (th->th_flags & TH_RST)
4799                 src->state = dst->state = TCPS_TIME_WAIT;
4800
4801         /* update expire time */
4802         (*state)->expire = time_second;
4803         if (src->state >= TCPS_FIN_WAIT_2 &&
4804             dst->state >= TCPS_FIN_WAIT_2)
4805                 (*state)->timeout = PFTM_TCP_CLOSED;
4806         else if (src->state >= TCPS_CLOSING &&
4807             dst->state >= TCPS_CLOSING)
4808                 (*state)->timeout = PFTM_TCP_FIN_WAIT;
4809         else if (src->state < TCPS_ESTABLISHED ||
4810             dst->state < TCPS_ESTABLISHED)
4811                 (*state)->timeout = PFTM_TCP_OPENING;
4812         else if (src->state >= TCPS_CLOSING ||
4813             dst->state >= TCPS_CLOSING)
4814                 (*state)->timeout = PFTM_TCP_CLOSING;
4815         else
4816                 (*state)->timeout = PFTM_TCP_ESTABLISHED;
4817
4818         return (PF_PASS);
4819 }
4820
4821 /*
4822  * Test TCP connection state.  Caller must hold the state locked.
4823  */
4824 int
4825 pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4826                   struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4827                   u_short *reason)
4828 {
4829         struct pf_state_key_cmp  key;
4830         struct tcphdr           *th = pd->hdr.tcp;
4831         int                      copyback = 0;
4832         int                      error;
4833         struct pf_state_peer    *src, *dst;
4834         struct pf_state_key     *sk;
4835
4836         bzero(&key, sizeof(key));
4837         key.af = pd->af;
4838         key.proto = IPPROTO_TCP;
4839         if (direction == PF_IN) {       /* wire side, straight */
4840                 PF_ACPY(&key.addr[0], pd->src, key.af);
4841                 PF_ACPY(&key.addr[1], pd->dst, key.af);
4842                 key.port[0] = th->th_sport;
4843                 key.port[1] = th->th_dport;
4844                 if (pf_status.debug >= PF_DEBUG_MISC) {
4845                         kprintf("test-tcp IN (%08x:%d) -> (%08x:%d)\n",
4846                                 ntohl(key.addr[0].addr32[0]),
4847                                 ntohs(key.port[0]),
4848                                 ntohl(key.addr[1].addr32[0]),
4849                                 ntohs(key.port[1]));
4850                 }
4851         } else {                        /* stack side, reverse */
4852                 PF_ACPY(&key.addr[1], pd->src, key.af);
4853                 PF_ACPY(&key.addr[0], pd->dst, key.af);
4854                 key.port[1] = th->th_sport;
4855                 key.port[0] = th->th_dport;
4856                 if (pf_status.debug >= PF_DEBUG_MISC) {
4857                         kprintf("test-tcp OUT (%08x:%d) <- (%08x:%d)\n",
4858                                 ntohl(key.addr[0].addr32[0]),
4859                                 ntohs(key.port[0]),
4860                                 ntohl(key.addr[1].addr32[0]),
4861                                 ntohs(key.port[1]));
4862                 }
4863         }
4864
4865         STATE_LOOKUP(kif, &key, direction, *state, m);
4866         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
4867
4868         if (direction == (*state)->direction) {
4869                 src = &(*state)->src;
4870                 dst = &(*state)->dst;
4871         } else {
4872                 src = &(*state)->dst;
4873                 dst = &(*state)->src;
4874         }
4875
4876         sk = (*state)->key[pd->didx];
4877
4878         if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4879                 if (direction != (*state)->direction) {
4880                         REASON_SET(reason, PFRES_SYNPROXY);
4881                         FAIL (PF_SYNPROXY_DROP);
4882                 }
4883                 if (th->th_flags & TH_SYN) {
4884                         if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4885                                 REASON_SET(reason, PFRES_SYNPROXY);
4886                                 FAIL (PF_DROP);
4887                         }
4888                         pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
4889                             pd->src, th->th_dport, th->th_sport,
4890                             (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4891                             TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1,
4892                             0, NULL, NULL);
4893                         REASON_SET(reason, PFRES_SYNPROXY);
4894                         FAIL (PF_SYNPROXY_DROP);
4895                 } else if (!(th->th_flags & TH_ACK) ||
4896                     (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4897                     (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4898                         REASON_SET(reason, PFRES_SYNPROXY);
4899                         FAIL (PF_DROP);
4900                 } else if ((*state)->src_node != NULL &&
4901                     pf_src_connlimit(*state)) {
4902                         REASON_SET(reason, PFRES_SRCLIMIT);
4903                         FAIL (PF_DROP);
4904                 } else
4905                         (*state)->src.state = PF_TCPS_PROXY_DST;
4906         }
4907         if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4908                 if (direction == (*state)->direction) {
4909                         if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4910                             (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4911                             (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4912                                 REASON_SET(reason, PFRES_SYNPROXY);
4913                                 FAIL (PF_DROP);
4914                         }
4915                         (*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4916                         if ((*state)->dst.seqhi == 1)
4917                                 (*state)->dst.seqhi = htonl(karc4random());
4918                         pf_send_tcp((*state)->rule.ptr, pd->af,
4919                             &sk->addr[pd->sidx], &sk->addr[pd->didx],
4920                             sk->port[pd->sidx], sk->port[pd->didx],
4921                             (*state)->dst.seqhi, 0, TH_SYN, 0,
4922                             (*state)->src.mss, 0, 0, (*state)->tag, NULL, NULL);
4923                         REASON_SET(reason, PFRES_SYNPROXY);
4924                         FAIL (PF_SYNPROXY_DROP);
4925                 } else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4926                     (TH_SYN|TH_ACK)) ||
4927                     (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4928                         REASON_SET(reason, PFRES_SYNPROXY);
4929                         FAIL (PF_DROP);
4930                 } else {
4931                         (*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4932                         (*state)->dst.seqlo = ntohl(th->th_seq);
4933                         pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
4934                             pd->src, th->th_dport, th->th_sport,
4935                             ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4936                             TH_ACK, (*state)->src.max_win, 0, 0, 0,
4937                             (*state)->tag, NULL, NULL);
4938                         pf_send_tcp((*state)->rule.ptr, pd->af,
4939                             &sk->addr[pd->sidx], &sk->addr[pd->didx],
4940                             sk->port[pd->sidx], sk->port[pd->didx],
4941                             (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4942                             TH_ACK, (*state)->dst.max_win, 0, 0, 1,
4943                             0, NULL, NULL);
4944                         (*state)->src.seqdiff = (*state)->dst.seqhi -
4945                             (*state)->src.seqlo;
4946                         (*state)->dst.seqdiff = (*state)->src.seqhi -
4947                             (*state)->dst.seqlo;
4948                         (*state)->src.seqhi = (*state)->src.seqlo +
4949                             (*state)->dst.max_win;
4950                         (*state)->dst.seqhi = (*state)->dst.seqlo +
4951                             (*state)->src.max_win;
4952                         (*state)->src.wscale = (*state)->dst.wscale = 0;
4953                         (*state)->src.state = (*state)->dst.state =
4954                             TCPS_ESTABLISHED;
4955                         REASON_SET(reason, PFRES_SYNPROXY);
4956                         FAIL (PF_SYNPROXY_DROP);
4957                 }
4958         }
4959
4960         /*
4961          * Check for connection (addr+port pair) reuse.  We can't actually
4962          * unlink the state if we don't own it.
4963          */
4964         if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4965             dst->state >= TCPS_FIN_WAIT_2 &&
4966             src->state >= TCPS_FIN_WAIT_2) {
4967                 if (pf_status.debug >= PF_DEBUG_MISC) {
4968                         kprintf("pf: state reuse ");
4969                         pf_print_state(*state);
4970                         pf_print_flags(th->th_flags);
4971                         kprintf("\n");
4972                 }
4973                 /* XXX make sure it's the same direction ?? */
4974                 (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4975                 if ((*state)->cpuid == mycpu->gd_cpuid) {
4976                         pf_unlink_state(*state);
4977                         *state = NULL;
4978                 } else {
4979                         (*state)->timeout = PFTM_PURGE;
4980                 }
4981                 FAIL (PF_DROP);
4982         }
4983
4984         if ((*state)->state_flags & PFSTATE_SLOPPY) {
4985                 if (pf_tcp_track_sloppy(src, dst, state, pd,
4986                                         reason) == PF_DROP) {
4987                         FAIL (PF_DROP);
4988                 }
4989         } else {
4990                 if (pf_tcp_track_full(src, dst, state, kif, m, off, pd,
4991                                       reason, &copyback) == PF_DROP) {
4992                         FAIL (PF_DROP);
4993                 }
4994         }
4995
4996         /* translate source/destination address, if necessary */
4997         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4998                 struct pf_state_key *nk = (*state)->key[pd->didx];
4999
5000                 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
5001                     nk->port[pd->sidx] != th->th_sport)  {
5002                         /*
5003                          * The translated source address may be completely
5004                          * unrelated to the saved link header, make sure
5005                          * a bridge doesn't try to use it.
5006                          */
5007                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
5008                         pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
5009                             &th->th_sum, &nk->addr[pd->sidx],
5010                             nk->port[pd->sidx], 0, pd->af);
5011                 }
5012
5013                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
5014                     nk->port[pd->didx] != th->th_dport) {
5015                         /*
5016                          * If we don't redispatch the packet will go into
5017                          * the protocol stack on the wrong cpu for the
5018                          * post-translated address.
5019                          */
5020                         pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
5021                             &th->th_sum, &nk->addr[pd->didx],
5022                             nk->port[pd->didx], 0, pd->af);
5023                 }
5024                 copyback = 1;
5025         }
5026
5027         /* Copyback sequence modulation or stateful scrub changes if needed */
5028         if (copyback) {
5029                 m->m_flags &= ~M_HASH;
5030                 m_copyback(m, off, sizeof(*th), (caddr_t)th);
5031         }
5032
5033         pfsync_update_state(*state);
5034         error = PF_PASS;
5035 done:
5036         if (*state)
5037                 lockmgr(&(*state)->lk, LK_RELEASE);
5038         return (error);
5039 }
5040
5041 /*
5042  * Test UDP connection state.  Caller must hold the state locked.
5043  */
5044 int
5045 pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
5046                   struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
5047 {
5048         struct pf_state_peer    *src, *dst;
5049         struct pf_state_key_cmp  key;
5050         struct udphdr           *uh = pd->hdr.udp;
5051
5052         bzero(&key, sizeof(key));
5053         key.af = pd->af;
5054         key.proto = IPPROTO_UDP;
5055         if (direction == PF_IN) {       /* wire side, straight */
5056                 PF_ACPY(&key.addr[0], pd->src, key.af);
5057                 PF_ACPY(&key.addr[1], pd->dst, key.af);
5058                 key.port[0] = uh->uh_sport;
5059                 key.port[1] = uh->uh_dport;
5060         } else {                        /* stack side, reverse */
5061                 PF_ACPY(&key.addr[1], pd->src, key.af);
5062                 PF_ACPY(&key.addr[0], pd->dst, key.af);
5063                 key.port[1] = uh->uh_sport;
5064                 key.port[0] = uh->uh_dport;
5065         }
5066
5067         STATE_LOOKUP(kif, &key, direction, *state, m);
5068         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5069
5070         if (direction == (*state)->direction) {
5071                 src = &(*state)->src;
5072                 dst = &(*state)->dst;
5073         } else {
5074                 src = &(*state)->dst;
5075                 dst = &(*state)->src;
5076         }
5077
5078         /* update states */
5079         if (src->state < PFUDPS_SINGLE)
5080                 src->state = PFUDPS_SINGLE;
5081         if (dst->state == PFUDPS_SINGLE)
5082                 dst->state = PFUDPS_MULTIPLE;
5083
5084         /* update expire time */
5085         (*state)->expire = time_second;
5086         if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
5087                 (*state)->timeout = PFTM_UDP_MULTIPLE;
5088         else
5089                 (*state)->timeout = PFTM_UDP_SINGLE;
5090
5091         /* translate source/destination address, if necessary */
5092         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5093                 struct pf_state_key *nk = (*state)->key[pd->didx];
5094
5095                 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
5096                     nk->port[pd->sidx] != uh->uh_sport) {
5097                         /*
5098                          * The translated source address may be completely
5099                          * unrelated to the saved link header, make sure
5100                          * a bridge doesn't try to use it.
5101                          */
5102                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
5103                         m->m_flags &= ~M_HASH;
5104                         pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
5105                             &uh->uh_sum, &nk->addr[pd->sidx],
5106                             nk->port[pd->sidx], 1, pd->af);
5107                 }
5108
5109                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
5110                     nk->port[pd->didx] != uh->uh_dport) {
5111                         /*
5112                          * If we don't redispatch the packet will go into
5113                          * the protocol stack on the wrong cpu for the
5114                          * post-translated address.
5115                          */
5116                         m->m_flags &= ~M_HASH;
5117                         pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
5118                             &uh->uh_sum, &nk->addr[pd->didx],
5119                             nk->port[pd->didx], 1, pd->af);
5120                 }
5121                 m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
5122         }
5123
5124         pfsync_update_state(*state);
5125         lockmgr(&(*state)->lk, LK_RELEASE);
5126         return (PF_PASS);
5127 }
5128
5129 /*
5130  * Test ICMP connection state.  Caller must hold the state locked.
5131  */
5132 int
5133 pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
5134                    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
5135                    u_short *reason)
5136 {
5137         struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
5138         u_int16_t        icmpid = 0, *icmpsum = NULL;
5139         u_int8_t         icmptype = 0;
5140         int              state_icmp = 0;
5141         int              error;
5142         struct pf_state_key_cmp key;
5143
5144         bzero(&key, sizeof(key));
5145
5146         switch (pd->proto) {
5147 #ifdef INET
5148         case IPPROTO_ICMP:
5149                 icmptype = pd->hdr.icmp->icmp_type;
5150                 icmpid = pd->hdr.icmp->icmp_id;
5151                 icmpsum = &pd->hdr.icmp->icmp_cksum;
5152
5153                 if (icmptype == ICMP_UNREACH ||
5154                     icmptype == ICMP_SOURCEQUENCH ||
5155                     icmptype == ICMP_REDIRECT ||
5156                     icmptype == ICMP_TIMXCEED ||
5157                     icmptype == ICMP_PARAMPROB)
5158                         state_icmp++;
5159                 break;
5160 #endif /* INET */
5161 #ifdef INET6
5162         case IPPROTO_ICMPV6:
5163                 icmptype = pd->hdr.icmp6->icmp6_type;
5164                 icmpid = pd->hdr.icmp6->icmp6_id;
5165                 icmpsum = &pd->hdr.icmp6->icmp6_cksum;
5166
5167                 if (icmptype == ICMP6_DST_UNREACH ||
5168                     icmptype == ICMP6_PACKET_TOO_BIG ||
5169                     icmptype == ICMP6_TIME_EXCEEDED ||
5170                     icmptype == ICMP6_PARAM_PROB)
5171                         state_icmp++;
5172                 break;
5173 #endif /* INET6 */
5174         }
5175
5176         if (!state_icmp) {
5177
5178                 /*
5179                  * ICMP query/reply message not related to a TCP/UDP packet.
5180                  * Search for an ICMP state.
5181                  */
5182                 key.af = pd->af;
5183                 key.proto = pd->proto;
5184                 key.port[0] = key.port[1] = icmpid;
5185                 if (direction == PF_IN) {       /* wire side, straight */
5186                         PF_ACPY(&key.addr[0], pd->src, key.af);
5187                         PF_ACPY(&key.addr[1], pd->dst, key.af);
5188                 } else {                        /* stack side, reverse */
5189                         PF_ACPY(&key.addr[1], pd->src, key.af);
5190                         PF_ACPY(&key.addr[0], pd->dst, key.af);
5191                 }
5192
5193                 STATE_LOOKUP(kif, &key, direction, *state, m);
5194                 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5195
5196                 (*state)->expire = time_second;
5197                 (*state)->timeout = PFTM_ICMP_ERROR_REPLY;
5198
5199                 /* translate source/destination address, if necessary */
5200                 if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5201                         struct pf_state_key *nk = (*state)->key[pd->didx];
5202
5203                         switch (pd->af) {
5204 #ifdef INET
5205                         case AF_INET:
5206                                 if (PF_ANEQ(pd->src,
5207                                     &nk->addr[pd->sidx], AF_INET))
5208                                         pf_change_a(&saddr->v4.s_addr,
5209                                             pd->ip_sum,
5210                                             nk->addr[pd->sidx].v4.s_addr, 0);
5211
5212                                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
5213                                     AF_INET))
5214                                         pf_change_a(&daddr->v4.s_addr,
5215                                             pd->ip_sum,
5216                                             nk->addr[pd->didx].v4.s_addr, 0);
5217
5218                                 if (nk->port[0] !=
5219                                     pd->hdr.icmp->icmp_id) {
5220                                         pd->hdr.icmp->icmp_cksum =
5221                                             pf_cksum_fixup(
5222                                             pd->hdr.icmp->icmp_cksum, icmpid,
5223                                             nk->port[pd->sidx], 0);
5224                                         pd->hdr.icmp->icmp_id =
5225                                             nk->port[pd->sidx];
5226                                 }
5227
5228                                 m->m_flags &= ~M_HASH;
5229                                 m_copyback(m, off, ICMP_MINLEN,
5230                                     (caddr_t)pd->hdr.icmp);
5231                                 break;
5232 #endif /* INET */
5233 #ifdef INET6
5234                         case AF_INET6:
5235                                 if (PF_ANEQ(pd->src,
5236                                     &nk->addr[pd->sidx], AF_INET6))
5237                                         pf_change_a6(saddr,
5238                                             &pd->hdr.icmp6->icmp6_cksum,
5239                                             &nk->addr[pd->sidx], 0);
5240
5241                                 if (PF_ANEQ(pd->dst,
5242                                     &nk->addr[pd->didx], AF_INET6))
5243                                         pf_change_a6(daddr,
5244                                             &pd->hdr.icmp6->icmp6_cksum,
5245                                             &nk->addr[pd->didx], 0);
5246
5247                                 m->m_flags &= ~M_HASH;
5248                                 m_copyback(m, off,
5249                                         sizeof(struct icmp6_hdr),
5250                                         (caddr_t)pd->hdr.icmp6);
5251                                 break;
5252 #endif /* INET6 */
5253                         }
5254                 }
5255         } else {
5256                 /*
5257                  * ICMP error message in response to a TCP/UDP packet.
5258                  * Extract the inner TCP/UDP header and search for that state.
5259                  */
5260
5261                 struct pf_pdesc pd2;
5262 #ifdef INET
5263                 struct ip       h2;
5264 #endif /* INET */
5265 #ifdef INET6
5266                 struct ip6_hdr  h2_6;
5267                 int             terminal = 0;
5268 #endif /* INET6 */
5269                 int             ipoff2;
5270                 int             off2;
5271
5272                 pd2.not_cpu_localized = 1;
5273                 pd2.af = pd->af;
5274                 /* Payload packet is from the opposite direction. */
5275                 pd2.sidx = (direction == PF_IN) ? 1 : 0;
5276                 pd2.didx = (direction == PF_IN) ? 0 : 1;
5277                 switch (pd->af) {
5278 #ifdef INET
5279                 case AF_INET:
5280                         /* offset of h2 in mbuf chain */
5281                         ipoff2 = off + ICMP_MINLEN;
5282
5283                         if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
5284                             NULL, reason, pd2.af)) {
5285                                 DPFPRINTF(PF_DEBUG_MISC,
5286                                     ("pf: ICMP error message too short "
5287                                     "(ip)\n"));
5288                                 FAIL (PF_DROP);
5289                         }
5290                         /*
5291                          * ICMP error messages don't refer to non-first
5292                          * fragments
5293                          */
5294                         if (h2.ip_off & htons(IP_OFFMASK)) {
5295                                 REASON_SET(reason, PFRES_FRAG);
5296                                 FAIL (PF_DROP);
5297                         }
5298
5299                         /* offset of protocol header that follows h2 */
5300                         off2 = ipoff2 + (h2.ip_hl << 2);
5301
5302                         pd2.proto = h2.ip_p;
5303                         pd2.src = (struct pf_addr *)&h2.ip_src;
5304                         pd2.dst = (struct pf_addr *)&h2.ip_dst;
5305                         pd2.ip_sum = &h2.ip_sum;
5306                         break;
5307 #endif /* INET */
5308 #ifdef INET6
5309                 case AF_INET6:
5310                         ipoff2 = off + sizeof(struct icmp6_hdr);
5311
5312                         if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
5313                             NULL, reason, pd2.af)) {
5314                                 DPFPRINTF(PF_DEBUG_MISC,
5315                                     ("pf: ICMP error message too short "
5316                                     "(ip6)\n"));
5317                                 FAIL (PF_DROP);
5318                         }
5319                         pd2.proto = h2_6.ip6_nxt;
5320                         pd2.src = (struct pf_addr *)&h2_6.ip6_src;
5321                         pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
5322                         pd2.ip_sum = NULL;
5323                         off2 = ipoff2 + sizeof(h2_6);
5324                         do {
5325                                 switch (pd2.proto) {
5326                                 case IPPROTO_FRAGMENT:
5327                                         /*
5328                                          * ICMPv6 error messages for
5329                                          * non-first fragments
5330                                          */
5331                                         REASON_SET(reason, PFRES_FRAG);
5332                                         FAIL (PF_DROP);
5333                                 case IPPROTO_AH:
5334                                 case IPPROTO_HOPOPTS:
5335                                 case IPPROTO_ROUTING:
5336                                 case IPPROTO_DSTOPTS: {
5337                                         /* get next header and header length */
5338                                         struct ip6_ext opt6;
5339
5340                                         if (!pf_pull_hdr(m, off2, &opt6,
5341                                             sizeof(opt6), NULL, reason,
5342                                             pd2.af)) {
5343                                                 DPFPRINTF(PF_DEBUG_MISC,
5344                                                     ("pf: ICMPv6 short opt\n"));
5345                                                 FAIL (PF_DROP);
5346                                         }
5347                                         if (pd2.proto == IPPROTO_AH)
5348                                                 off2 += (opt6.ip6e_len + 2) * 4;
5349                                         else
5350                                                 off2 += (opt6.ip6e_len + 1) * 8;
5351                                         pd2.proto = opt6.ip6e_nxt;
5352                                         /* goto the next header */
5353                                         break;
5354                                 }
5355                                 default:
5356                                         terminal++;
5357                                         break;
5358                                 }
5359                         } while (!terminal);
5360                         break;
5361 #endif /* INET6 */
5362                 default:
5363                         DPFPRINTF(PF_DEBUG_MISC,
5364                             ("pf: ICMP AF %d unknown (ip6)\n", pd->af));
5365                         FAIL (PF_DROP);
5366                         break;
5367                 }
5368
5369                 switch (pd2.proto) {
5370                 case IPPROTO_TCP: {
5371                         struct tcphdr            th;
5372                         u_int32_t                seq;
5373                         struct pf_state_peer    *src, *dst;
5374                         u_int8_t                 dws;
5375                         int                      copyback = 0;
5376
5377                         /*
5378                          * Only the first 8 bytes of the TCP header can be
5379                          * expected. Don't access any TCP header fields after
5380                          * th_seq, an ackskew test is not possible.
5381                          */
5382                         if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
5383                             pd2.af)) {
5384                                 DPFPRINTF(PF_DEBUG_MISC,
5385                                     ("pf: ICMP error message too short "
5386                                     "(tcp)\n"));
5387                                 FAIL (PF_DROP);
5388                         }
5389
5390                         key.af = pd2.af;
5391                         key.proto = IPPROTO_TCP;
5392                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5393                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5394                         key.port[pd2.sidx] = th.th_sport;
5395                         key.port[pd2.didx] = th.th_dport;
5396
5397                         STATE_LOOKUP(kif, &key, direction, *state, m);
5398                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5399
5400                         if (direction == (*state)->direction) {
5401                                 src = &(*state)->dst;
5402                                 dst = &(*state)->src;
5403                         } else {
5404                                 src = &(*state)->src;
5405                                 dst = &(*state)->dst;
5406                         }
5407
5408                         if (src->wscale && dst->wscale)
5409                                 dws = dst->wscale & PF_WSCALE_MASK;
5410                         else
5411                                 dws = 0;
5412
5413                         /* Demodulate sequence number */
5414                         seq = ntohl(th.th_seq) - src->seqdiff;
5415                         if (src->seqdiff) {
5416                                 pf_change_a(&th.th_seq, icmpsum,
5417                                     htonl(seq), 0);
5418                                 copyback = 1;
5419                         }
5420
5421                         if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
5422                             (!SEQ_GEQ(src->seqhi, seq) ||
5423                             !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
5424                                 if (pf_status.debug >= PF_DEBUG_MISC) {
5425                                         kprintf("pf: BAD ICMP %d:%d ",
5426                                             icmptype, pd->hdr.icmp->icmp_code);
5427                                         pf_print_host(pd->src, 0, pd->af);
5428                                         kprintf(" -> ");
5429                                         pf_print_host(pd->dst, 0, pd->af);
5430                                         kprintf(" state: ");
5431                                         pf_print_state(*state);
5432                                         kprintf(" seq=%u\n", seq);
5433                                 }
5434                                 REASON_SET(reason, PFRES_BADSTATE);
5435                                 FAIL (PF_DROP);
5436                         } else {
5437                                 if (pf_status.debug >= PF_DEBUG_MISC) {
5438                                         kprintf("pf: OK ICMP %d:%d ",
5439                                             icmptype, pd->hdr.icmp->icmp_code);
5440                                         pf_print_host(pd->src, 0, pd->af);
5441                                         kprintf(" -> ");
5442                                         pf_print_host(pd->dst, 0, pd->af);
5443                                         kprintf(" state: ");
5444                                         pf_print_state(*state);
5445                                         kprintf(" seq=%u\n", seq);
5446                                 }
5447                         }
5448
5449                         /* translate source/destination address, if necessary */
5450                         if ((*state)->key[PF_SK_WIRE] !=
5451                             (*state)->key[PF_SK_STACK]) {
5452                                 struct pf_state_key *nk =
5453                                     (*state)->key[pd->didx];
5454
5455                                 if (PF_ANEQ(pd2.src,
5456                                     &nk->addr[pd2.sidx], pd2.af) ||
5457                                     nk->port[pd2.sidx] != th.th_sport)
5458                                         pf_change_icmp(pd2.src, &th.th_sport,
5459                                             daddr, &nk->addr[pd2.sidx],
5460                                             nk->port[pd2.sidx], NULL,
5461                                             pd2.ip_sum, icmpsum,
5462                                             pd->ip_sum, 0, pd2.af);
5463
5464                                 if (PF_ANEQ(pd2.dst,
5465                                     &nk->addr[pd2.didx], pd2.af) ||
5466                                     nk->port[pd2.didx] != th.th_dport)
5467                                         pf_change_icmp(pd2.dst, &th.th_dport,
5468                                             NULL, /* XXX Inbound NAT? */
5469                                             &nk->addr[pd2.didx],
5470                                             nk->port[pd2.didx], NULL,
5471                                             pd2.ip_sum, icmpsum,
5472                                             pd->ip_sum, 0, pd2.af);
5473                                 copyback = 1;
5474                         }
5475
5476                         if (copyback) {
5477                                 switch (pd2.af) {
5478 #ifdef INET
5479                                 case AF_INET:
5480                                         m_copyback(m, off, ICMP_MINLEN,
5481                                             (caddr_t)pd->hdr.icmp);
5482                                         m_copyback(m, ipoff2, sizeof(h2),
5483                                             (caddr_t)&h2);
5484                                         break;
5485 #endif /* INET */
5486 #ifdef INET6
5487                                 case AF_INET6:
5488                                         m_copyback(m, off,
5489                                             sizeof(struct icmp6_hdr),
5490                                             (caddr_t)pd->hdr.icmp6);
5491                                         m_copyback(m, ipoff2, sizeof(h2_6),
5492                                             (caddr_t)&h2_6);
5493                                         break;
5494 #endif /* INET6 */
5495                                 }
5496                                 m->m_flags &= ~M_HASH;
5497                                 m_copyback(m, off2, 8, (caddr_t)&th);
5498                         }
5499                         break;
5500                 }
5501                 case IPPROTO_UDP: {
5502                         struct udphdr           uh;
5503
5504                         if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
5505                             NULL, reason, pd2.af)) {
5506                                 DPFPRINTF(PF_DEBUG_MISC,
5507                                     ("pf: ICMP error message too short "
5508                                     "(udp)\n"));
5509                                 return (PF_DROP);
5510                         }
5511
5512                         key.af = pd2.af;
5513                         key.proto = IPPROTO_UDP;
5514                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5515                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5516                         key.port[pd2.sidx] = uh.uh_sport;
5517                         key.port[pd2.didx] = uh.uh_dport;
5518
5519                         STATE_LOOKUP(kif, &key, direction, *state, m);
5520                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5521
5522                         /* translate source/destination address, if necessary */
5523                         if ((*state)->key[PF_SK_WIRE] !=
5524                             (*state)->key[PF_SK_STACK]) {
5525                                 struct pf_state_key *nk =
5526                                     (*state)->key[pd->didx];
5527
5528                                 if (PF_ANEQ(pd2.src,
5529                                     &nk->addr[pd2.sidx], pd2.af) ||
5530                                     nk->port[pd2.sidx] != uh.uh_sport)
5531                                         pf_change_icmp(pd2.src, &uh.uh_sport,
5532                                             daddr, &nk->addr[pd2.sidx],
5533                                             nk->port[pd2.sidx], &uh.uh_sum,
5534                                             pd2.ip_sum, icmpsum,
5535                                             pd->ip_sum, 1, pd2.af);
5536
5537                                 if (PF_ANEQ(pd2.dst,
5538                                     &nk->addr[pd2.didx], pd2.af) ||
5539                                     nk->port[pd2.didx] != uh.uh_dport)
5540                                         pf_change_icmp(pd2.dst, &uh.uh_dport,
5541                                             NULL, /* XXX Inbound NAT? */
5542                                             &nk->addr[pd2.didx],
5543                                             nk->port[pd2.didx], &uh.uh_sum,
5544                                             pd2.ip_sum, icmpsum,
5545                                             pd->ip_sum, 1, pd2.af);
5546
5547                                 switch (pd2.af) {
5548 #ifdef INET
5549                                 case AF_INET:
5550                                         m_copyback(m, off, ICMP_MINLEN,
5551                                             (caddr_t)pd->hdr.icmp);
5552                                         m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5553                                         break;
5554 #endif /* INET */
5555 #ifdef INET6
5556                                 case AF_INET6:
5557                                         m_copyback(m, off,
5558                                             sizeof(struct icmp6_hdr),
5559                                             (caddr_t)pd->hdr.icmp6);
5560                                         m_copyback(m, ipoff2, sizeof(h2_6),
5561                                             (caddr_t)&h2_6);
5562                                         break;
5563 #endif /* INET6 */
5564                                 }
5565                                 m->m_flags &= ~M_HASH;
5566                                 m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
5567                         }
5568                         break;
5569                 }
5570 #ifdef INET
5571                 case IPPROTO_ICMP: {
5572                         struct icmp             iih;
5573
5574                         if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
5575                             NULL, reason, pd2.af)) {
5576                                 DPFPRINTF(PF_DEBUG_MISC,
5577                                     ("pf: ICMP error message too short i"
5578                                     "(icmp)\n"));
5579                                 return (PF_DROP);
5580                         }
5581
5582                         key.af = pd2.af;
5583                         key.proto = IPPROTO_ICMP;
5584                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5585                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5586                         key.port[0] = key.port[1] = iih.icmp_id;
5587
5588                         STATE_LOOKUP(kif, &key, direction, *state, m);
5589                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5590
5591                         /* translate source/destination address, if necessary */
5592                         if ((*state)->key[PF_SK_WIRE] !=
5593                             (*state)->key[PF_SK_STACK]) {
5594                                 struct pf_state_key *nk =
5595                                     (*state)->key[pd->didx];
5596
5597                                 if (PF_ANEQ(pd2.src,
5598                                     &nk->addr[pd2.sidx], pd2.af) ||
5599                                     nk->port[pd2.sidx] != iih.icmp_id)
5600                                         pf_change_icmp(pd2.src, &iih.icmp_id,
5601                                             daddr, &nk->addr[pd2.sidx],
5602                                             nk->port[pd2.sidx], NULL,
5603                                             pd2.ip_sum, icmpsum,
5604                                             pd->ip_sum, 0, AF_INET);
5605
5606                                 if (PF_ANEQ(pd2.dst,
5607                                     &nk->addr[pd2.didx], pd2.af) ||
5608                                     nk->port[pd2.didx] != iih.icmp_id)
5609                                         pf_change_icmp(pd2.dst, &iih.icmp_id,
5610                                             NULL, /* XXX Inbound NAT? */
5611                                             &nk->addr[pd2.didx],
5612                                             nk->port[pd2.didx], NULL,
5613                                             pd2.ip_sum, icmpsum,
5614                                             pd->ip_sum, 0, AF_INET);
5615
5616                                 m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
5617                                 m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5618                                 m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
5619                                 m->m_flags &= ~M_HASH;
5620                         }
5621                         break;
5622                 }
5623 #endif /* INET */
5624 #ifdef INET6
5625                 case IPPROTO_ICMPV6: {
5626                         struct icmp6_hdr        iih;
5627
5628                         if (!pf_pull_hdr(m, off2, &iih,
5629                             sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
5630                                 DPFPRINTF(PF_DEBUG_MISC,
5631                                     ("pf: ICMP error message too short "
5632                                     "(icmp6)\n"));
5633                                 FAIL (PF_DROP);
5634                         }
5635
5636                         key.af = pd2.af;
5637                         key.proto = IPPROTO_ICMPV6;
5638                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5639                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5640                         key.port[0] = key.port[1] = iih.icmp6_id;
5641
5642                         STATE_LOOKUP(kif, &key, direction, *state, m);
5643                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5644
5645                         /* translate source/destination address, if necessary */
5646                         if ((*state)->key[PF_SK_WIRE] !=
5647                             (*state)->key[PF_SK_STACK]) {
5648                                 struct pf_state_key *nk =
5649                                     (*state)->key[pd->didx];
5650
5651                                 if (PF_ANEQ(pd2.src,
5652                                     &nk->addr[pd2.sidx], pd2.af) ||
5653                                     nk->port[pd2.sidx] != iih.icmp6_id)
5654                                         pf_change_icmp(pd2.src, &iih.icmp6_id,
5655                                             daddr, &nk->addr[pd2.sidx],
5656                                             nk->port[pd2.sidx], NULL,
5657                                             pd2.ip_sum, icmpsum,
5658                                             pd->ip_sum, 0, AF_INET6);
5659
5660                                 if (PF_ANEQ(pd2.dst,
5661                                     &nk->addr[pd2.didx], pd2.af) ||
5662                                     nk->port[pd2.didx] != iih.icmp6_id)
5663                                         pf_change_icmp(pd2.dst, &iih.icmp6_id,
5664                                             NULL, /* XXX Inbound NAT? */
5665                                             &nk->addr[pd2.didx],
5666                                             nk->port[pd2.didx], NULL,
5667                                             pd2.ip_sum, icmpsum,
5668                                             pd->ip_sum, 0, AF_INET6);
5669
5670                                 m_copyback(m, off, sizeof(struct icmp6_hdr),
5671                                     (caddr_t)pd->hdr.icmp6);
5672                                 m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
5673                                 m_copyback(m, off2, sizeof(struct icmp6_hdr),
5674                                     (caddr_t)&iih);
5675                                 m->m_flags &= ~M_HASH;
5676                         }
5677                         break;
5678                 }
5679 #endif /* INET6 */
5680                 default: {
5681                         key.af = pd2.af;
5682                         key.proto = pd2.proto;
5683                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5684                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5685                         key.port[0] = key.port[1] = 0;
5686
5687                         STATE_LOOKUP(kif, &key, direction, *state, m);
5688                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5689
5690                         /* translate source/destination address, if necessary */
5691                         if ((*state)->key[PF_SK_WIRE] !=
5692                             (*state)->key[PF_SK_STACK]) {
5693                                 struct pf_state_key *nk =
5694                                     (*state)->key[pd->didx];
5695
5696                                 if (PF_ANEQ(pd2.src,
5697                                     &nk->addr[pd2.sidx], pd2.af))
5698                                         pf_change_icmp(pd2.src, NULL, daddr,
5699                                             &nk->addr[pd2.sidx], 0, NULL,
5700                                             pd2.ip_sum, icmpsum,
5701                                             pd->ip_sum, 0, pd2.af);
5702
5703                                 if (PF_ANEQ(pd2.dst,
5704                                     &nk->addr[pd2.didx], pd2.af))
5705                                         pf_change_icmp(pd2.src, NULL,
5706                                             NULL, /* XXX Inbound NAT? */
5707                                             &nk->addr[pd2.didx], 0, NULL,
5708                                             pd2.ip_sum, icmpsum,
5709                                             pd->ip_sum, 0, pd2.af);
5710
5711                                 switch (pd2.af) {
5712 #ifdef INET
5713                                 case AF_INET:
5714                                         m_copyback(m, off, ICMP_MINLEN,
5715                                             (caddr_t)pd->hdr.icmp);
5716                                         m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5717                                         m->m_flags &= ~M_HASH;
5718                                         break;
5719 #endif /* INET */
5720 #ifdef INET6
5721                                 case AF_INET6:
5722                                         m_copyback(m, off,
5723                                             sizeof(struct icmp6_hdr),
5724                                             (caddr_t)pd->hdr.icmp6);
5725                                         m_copyback(m, ipoff2, sizeof(h2_6),
5726                                             (caddr_t)&h2_6);
5727                                         m->m_flags &= ~M_HASH;
5728                                         break;
5729 #endif /* INET6 */
5730                                 }
5731                         }
5732                         break;
5733                 }
5734                 }
5735         }
5736
5737         pfsync_update_state(*state);
5738         error = PF_PASS;
5739 done:
5740         if (*state)
5741                 lockmgr(&(*state)->lk, LK_RELEASE);
5742         return (error);
5743 }
5744
5745 /*
5746  * Test other connection state.  Caller must hold the state locked.
5747  */
5748 int
5749 pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
5750                     struct mbuf *m, struct pf_pdesc *pd)
5751 {
5752         struct pf_state_peer    *src, *dst;
5753         struct pf_state_key_cmp  key;
5754
5755         bzero(&key, sizeof(key));
5756         key.af = pd->af;
5757         key.proto = pd->proto;
5758         if (direction == PF_IN) {
5759                 PF_ACPY(&key.addr[0], pd->src, key.af);
5760                 PF_ACPY(&key.addr[1], pd->dst, key.af);
5761                 key.port[0] = key.port[1] = 0;
5762         } else {
5763                 PF_ACPY(&key.addr[1], pd->src, key.af);
5764                 PF_ACPY(&key.addr[0], pd->dst, key.af);
5765                 key.port[1] = key.port[0] = 0;
5766         }
5767
5768         STATE_LOOKUP(kif, &key, direction, *state, m);
5769         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5770
5771         if (direction == (*state)->direction) {
5772                 src = &(*state)->src;
5773                 dst = &(*state)->dst;
5774         } else {
5775                 src = &(*state)->dst;
5776                 dst = &(*state)->src;
5777         }
5778
5779         /* update states */
5780         if (src->state < PFOTHERS_SINGLE)
5781                 src->state = PFOTHERS_SINGLE;
5782         if (dst->state == PFOTHERS_SINGLE)
5783                 dst->state = PFOTHERS_MULTIPLE;
5784
5785         /* update expire time */
5786         (*state)->expire = time_second;
5787         if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
5788                 (*state)->timeout = PFTM_OTHER_MULTIPLE;
5789         else
5790                 (*state)->timeout = PFTM_OTHER_SINGLE;
5791
5792         /* translate source/destination address, if necessary */
5793         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5794                 struct pf_state_key *nk = (*state)->key[pd->didx];
5795
5796                 KKASSERT(nk);
5797                 KKASSERT(pd);
5798                 KKASSERT(pd->src);
5799                 KKASSERT(pd->dst);
5800                 switch (pd->af) {
5801 #ifdef INET
5802                 case AF_INET:
5803                         if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5804                                 pf_change_a(&pd->src->v4.s_addr,
5805                                     pd->ip_sum,
5806                                     nk->addr[pd->sidx].v4.s_addr,
5807                                     0);
5808
5809
5810                         if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5811                                 pf_change_a(&pd->dst->v4.s_addr,
5812                                     pd->ip_sum,
5813                                     nk->addr[pd->didx].v4.s_addr,
5814                                     0);
5815
5816                         break;
5817 #endif /* INET */
5818 #ifdef INET6
5819                 case AF_INET6:
5820                         if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5821                                 PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
5822
5823                         if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5824                                 PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
5825 #endif /* INET6 */
5826                 }
5827         }
5828
5829         pfsync_update_state(*state);
5830         lockmgr(&(*state)->lk, LK_RELEASE);
5831         return (PF_PASS);
5832 }
5833
5834 /*
5835  * ipoff and off are measured from the start of the mbuf chain.
5836  * h must be at "ipoff" on the mbuf chain.
5837  */
5838 void *
5839 pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
5840     u_short *actionp, u_short *reasonp, sa_family_t af)
5841 {
5842         switch (af) {
5843 #ifdef INET
5844         case AF_INET: {
5845                 struct ip       *h = mtod(m, struct ip *);
5846                 u_int16_t        fragoff = (h->ip_off & IP_OFFMASK) << 3;
5847
5848                 if (fragoff) {
5849                         if (fragoff >= len)
5850                                 ACTION_SET(actionp, PF_PASS);
5851                         else {
5852                                 ACTION_SET(actionp, PF_DROP);
5853                                 REASON_SET(reasonp, PFRES_FRAG);
5854                         }
5855                         return (NULL);
5856                 }
5857                 if (m->m_pkthdr.len < off + len ||
5858                     h->ip_len < off + len) {
5859                         ACTION_SET(actionp, PF_DROP);
5860                         REASON_SET(reasonp, PFRES_SHORT);
5861                         return (NULL);
5862                 }
5863                 break;
5864         }
5865 #endif /* INET */
5866 #ifdef INET6
5867         case AF_INET6: {
5868                 struct ip6_hdr  *h = mtod(m, struct ip6_hdr *);
5869
5870                 if (m->m_pkthdr.len < off + len ||
5871                     (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
5872                     (unsigned)(off + len)) {
5873                         ACTION_SET(actionp, PF_DROP);
5874                         REASON_SET(reasonp, PFRES_SHORT);
5875                         return (NULL);
5876                 }
5877                 break;
5878         }
5879 #endif /* INET6 */
5880         }
5881         m_copydata(m, off, len, p);
5882         return (p);
5883 }
5884
5885 int
5886 pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif)
5887 {
5888         struct sockaddr_in      *dst;
5889         int                      ret = 1;
5890         int                      check_mpath;
5891 #ifdef INET6
5892         struct sockaddr_in6     *dst6;
5893         struct route_in6         ro;
5894 #else
5895         struct route             ro;
5896 #endif
5897         struct radix_node       *rn;
5898         struct rtentry          *rt;
5899         struct ifnet            *ifp;
5900
5901         check_mpath = 0;
5902         bzero(&ro, sizeof(ro));
5903         switch (af) {
5904         case AF_INET:
5905                 dst = satosin(&ro.ro_dst);
5906                 dst->sin_family = AF_INET;
5907                 dst->sin_len = sizeof(*dst);
5908                 dst->sin_addr = addr->v4;
5909                 break;
5910 #ifdef INET6
5911         case AF_INET6:
5912                 /*
5913                  * Skip check for addresses with embedded interface scope,
5914                  * as they would always match anyway.
5915                  */
5916                 if (IN6_IS_SCOPE_EMBED(&addr->v6))
5917                         goto out;
5918                 dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5919                 dst6->sin6_family = AF_INET6;
5920                 dst6->sin6_len = sizeof(*dst6);
5921                 dst6->sin6_addr = addr->v6;
5922                 break;
5923 #endif /* INET6 */
5924         default:
5925                 return (0);
5926         }
5927
5928         /* Skip checks for ipsec interfaces */
5929         if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5930                 goto out;
5931
5932         rtalloc_ign((struct route *)&ro, 0);
5933
5934         if (ro.ro_rt != NULL) {
5935                 /* No interface given, this is a no-route check */
5936                 if (kif == NULL)
5937                         goto out;
5938
5939                 if (kif->pfik_ifp == NULL) {
5940                         ret = 0;
5941                         goto out;
5942                 }
5943
5944                 /* Perform uRPF check if passed input interface */
5945                 ret = 0;
5946                 rn = (struct radix_node *)ro.ro_rt;
5947                 do {
5948                         rt = (struct rtentry *)rn;
5949                         ifp = rt->rt_ifp;
5950
5951                         if (kif->pfik_ifp == ifp)
5952                                 ret = 1;
5953                         rn = NULL;
5954                 } while (check_mpath == 1 && rn != NULL && ret == 0);
5955         } else
5956                 ret = 0;
5957 out:
5958         if (ro.ro_rt != NULL)
5959                 RTFREE(ro.ro_rt);
5960         return (ret);
5961 }
5962
5963 int
5964 pf_rtlabel_match(struct pf_addr *addr, sa_family_t af, struct pf_addr_wrap *aw)
5965 {
5966         struct sockaddr_in      *dst;
5967 #ifdef INET6
5968         struct sockaddr_in6     *dst6;
5969         struct route_in6         ro;
5970 #else
5971         struct route             ro;
5972 #endif
5973         int                      ret = 0;
5974
5975         ASSERT_LWKT_TOKEN_HELD(&pf_token);
5976
5977         bzero(&ro, sizeof(ro));
5978         switch (af) {
5979         case AF_INET:
5980                 dst = satosin(&ro.ro_dst);
5981                 dst->sin_family = AF_INET;
5982                 dst->sin_len = sizeof(*dst);
5983                 dst->sin_addr = addr->v4;
5984                 break;
5985 #ifdef INET6
5986         case AF_INET6:
5987                 dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5988                 dst6->sin6_family = AF_INET6;
5989                 dst6->sin6_len = sizeof(*dst6);
5990                 dst6->sin6_addr = addr->v6;
5991                 break;
5992 #endif /* INET6 */
5993         default:
5994                 return (0);
5995         }
5996
5997 rtalloc_ign((struct route *)&ro, (RTF_CLONING | RTF_PRCLONING));
5998
5999         if (ro.ro_rt != NULL) {
6000                 RTFREE(ro.ro_rt);
6001         }
6002
6003         return (ret);
6004 }
6005
6006 #ifdef INET
6007 void
6008 pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
6009     struct pf_state *s, struct pf_pdesc *pd)
6010 {
6011         struct mbuf             *m0, *m1;
6012         struct route             iproute;
6013         struct route            *ro = NULL;
6014         struct sockaddr_in      *dst;
6015         struct ip               *ip;
6016         struct ifnet            *ifp = NULL;
6017         struct pf_addr           naddr;
6018         struct pf_src_node      *sn = NULL;
6019         int                      error = 0;
6020         int sw_csum;
6021
6022         ASSERT_LWKT_TOKEN_HELD(&pf_token);
6023
6024         if (m == NULL || *m == NULL || r == NULL ||
6025             (dir != PF_IN && dir != PF_OUT) || oifp == NULL)
6026                 panic("pf_route: invalid parameters");
6027
6028         if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
6029                 (*m)->m_pkthdr.fw_flags |= PF_MBUF_ROUTED;
6030                 (*m)->m_pkthdr.pf.routed = 1;
6031         } else {
6032                 if ((*m)->m_pkthdr.pf.routed++ > 3) {
6033                         m0 = *m;
6034                         *m = NULL;
6035                         goto bad;
6036                 }
6037         }
6038
6039         if (r->rt == PF_DUPTO) {
6040                 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
6041                         return;
6042                 }
6043         } else {
6044                 if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
6045                         return;
6046                 }
6047                 m0 = *m;
6048         }
6049
6050         if (m0->m_len < sizeof(struct ip)) {
6051                 DPFPRINTF(PF_DEBUG_URGENT,
6052                     ("pf_route: m0->m_len < sizeof(struct ip)\n"));
6053                 goto bad;
6054         }
6055
6056         ip = mtod(m0, struct ip *);
6057
6058         ro = &iproute;
6059         bzero((caddr_t)ro, sizeof(*ro));
6060         dst = satosin(&ro->ro_dst);
6061         dst->sin_family = AF_INET;
6062         dst->sin_len = sizeof(*dst);
6063         dst->sin_addr = ip->ip_dst;
6064
6065         if (r->rt == PF_FASTROUTE) {
6066                 rtalloc(ro);
6067                 if (ro->ro_rt == 0) {
6068                         ipstat.ips_noroute++;
6069                         goto bad;
6070                 }
6071
6072                 ifp = ro->ro_rt->rt_ifp;
6073                 ro->ro_rt->rt_use++;
6074
6075                 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
6076                         dst = satosin(ro->ro_rt->rt_gateway);
6077         } else {
6078                 if (TAILQ_EMPTY(&r->rpool.list)) {
6079                         DPFPRINTF(PF_DEBUG_URGENT,
6080                             ("pf_route: TAILQ_EMPTY(&r->rpool.list)\n"));
6081                         goto bad;
6082                 }
6083                 if (s == NULL) {
6084                         pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
6085                             &naddr, NULL, &sn);
6086                         if (!PF_AZERO(&naddr, AF_INET))
6087                                 dst->sin_addr.s_addr = naddr.v4.s_addr;
6088                         ifp = r->rpool.cur->kif ?
6089                             r->rpool.cur->kif->pfik_ifp : NULL;
6090                 } else {
6091                         if (!PF_AZERO(&s->rt_addr, AF_INET))
6092                                 dst->sin_addr.s_addr =
6093                                     s->rt_addr.v4.s_addr;
6094                         ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
6095                 }
6096         }
6097         if (ifp == NULL)
6098                 goto bad;
6099
6100         if (oifp != ifp) {
6101                 if (pf_test(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
6102                         goto bad;
6103                 } else if (m0 == NULL) {
6104                         goto done;
6105                 }
6106                 if (m0->m_len < sizeof(struct ip)) {
6107                         DPFPRINTF(PF_DEBUG_URGENT,
6108                             ("pf_route: m0->m_len < sizeof(struct ip)\n"));
6109                         goto bad;
6110                 }
6111                 ip = mtod(m0, struct ip *);
6112         }
6113
6114         /* Copied from FreeBSD 5.1-CURRENT ip_output. */
6115         m0->m_pkthdr.csum_flags |= CSUM_IP;
6116         sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist;
6117         if (sw_csum & CSUM_DELAY_DATA) {
6118                 in_delayed_cksum(m0);
6119                 sw_csum &= ~CSUM_DELAY_DATA;
6120         }
6121         m0->m_pkthdr.csum_flags &= ifp->if_hwassist;
6122         m0->m_pkthdr.csum_iphlen = (ip->ip_hl << 2);
6123
6124         /*
6125          * WARNING!  We cannot fragment if the packet was modified from an
6126          *           original which expected to be using TSO.  In this
6127          *           situation we pray that the target interface is
6128          *           compatible with the originating interface.
6129          */
6130         if (ip->ip_len <= ifp->if_mtu ||
6131             (m0->m_pkthdr.csum_flags & CSUM_TSO) ||
6132             ((ifp->if_hwassist & CSUM_FRAGMENT) &&
6133                 (ip->ip_off & IP_DF) == 0)) {
6134                 ip->ip_len = htons(ip->ip_len);
6135                 ip->ip_off = htons(ip->ip_off);
6136                 ip->ip_sum = 0;
6137                 if (sw_csum & CSUM_DELAY_IP) {
6138                         /* From KAME */
6139                         if (ip->ip_v == IPVERSION &&
6140                             (ip->ip_hl << 2) == sizeof(*ip)) {
6141                                 ip->ip_sum = in_cksum_hdr(ip);
6142                         } else {
6143                                 ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
6144                         }
6145                 }
6146                 lwkt_reltoken(&pf_token);
6147                 error = ifp->if_output(ifp, m0, sintosa(dst), ro->ro_rt);
6148                 lwkt_gettoken(&pf_token);
6149                 goto done;
6150         }
6151
6152         /*
6153          * Too large for interface; fragment if possible.
6154          * Must be able to put at least 8 bytes per fragment.
6155          */
6156         if (ip->ip_off & IP_DF) {
6157                 ipstat.ips_cantfrag++;
6158                 if (r->rt != PF_DUPTO) {
6159                         icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
6160                                    ifp->if_mtu);
6161                         goto done;
6162                 } else
6163                         goto bad;
6164         }
6165
6166         m1 = m0;
6167         error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum);
6168         if (error) {
6169                 goto bad;
6170         }
6171
6172         for (m0 = m1; m0; m0 = m1) {
6173                 m1 = m0->m_nextpkt;
6174                 m0->m_nextpkt = 0;
6175                 if (error == 0) {
6176                         lwkt_reltoken(&pf_token);
6177                         error = (*ifp->if_output)(ifp, m0, sintosa(dst),
6178                                                   NULL);
6179                         lwkt_gettoken(&pf_token);
6180                 } else
6181                         m_freem(m0);
6182         }
6183
6184         if (error == 0)
6185                 ipstat.ips_fragmented++;
6186
6187 done:
6188         if (r->rt != PF_DUPTO)
6189                 *m = NULL;
6190         if (ro == &iproute && ro->ro_rt)
6191                 RTFREE(ro->ro_rt);
6192         return;
6193
6194 bad:
6195         m_freem(m0);
6196         goto done;
6197 }
6198 #endif /* INET */
6199
6200 #ifdef INET6
6201 void
6202 pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
6203     struct pf_state *s, struct pf_pdesc *pd)
6204 {
6205         struct mbuf             *m0;
6206         struct route_in6         ip6route;
6207         struct route_in6        *ro;
6208         struct sockaddr_in6     *dst;
6209         struct ip6_hdr          *ip6;
6210         struct ifnet            *ifp = NULL;
6211         struct pf_addr           naddr;
6212         struct pf_src_node      *sn = NULL;
6213
6214         if (m == NULL || *m == NULL || r == NULL ||
6215             (dir != PF_IN && dir != PF_OUT) || oifp == NULL)
6216                 panic("pf_route6: invalid parameters");
6217
6218         if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
6219                 (*m)->m_pkthdr.fw_flags |= PF_MBUF_ROUTED;
6220                 (*m)->m_pkthdr.pf.routed = 1;
6221         } else {
6222                 if ((*m)->m_pkthdr.pf.routed++ > 3) {
6223                         m0 = *m;
6224                         *m = NULL;
6225                         goto bad;
6226                 }
6227         }
6228
6229         if (r->rt == PF_DUPTO) {
6230                 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL)
6231                         return;
6232         } else {
6233                 if ((r->rt == PF_REPLYTO) == (r->direction == dir))
6234                         return;
6235                 m0 = *m;
6236         }
6237
6238         if (m0->m_len < sizeof(struct ip6_hdr)) {
6239                 DPFPRINTF(PF_DEBUG_URGENT,
6240                     ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
6241                 goto bad;
6242         }
6243         ip6 = mtod(m0, struct ip6_hdr *);
6244
6245         ro = &ip6route;
6246         bzero((caddr_t)ro, sizeof(*ro));
6247         dst = (struct sockaddr_in6 *)&ro->ro_dst;
6248         dst->sin6_family = AF_INET6;
6249         dst->sin6_len = sizeof(*dst);
6250         dst->sin6_addr = ip6->ip6_dst;
6251
6252         /*
6253          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
6254          * so make sure pf.flags is clear.
6255          *
6256          * Cheat. XXX why only in the v6 case???
6257          */
6258         if (r->rt == PF_FASTROUTE) {
6259                 m0->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
6260                 m0->m_pkthdr.pf.flags = 0;
6261                 /* XXX Re-Check when Upgrading to > 4.4 */
6262                 m0->m_pkthdr.pf.statekey = NULL;
6263                 ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
6264                 return;
6265         }
6266
6267         if (TAILQ_EMPTY(&r->rpool.list)) {
6268                 DPFPRINTF(PF_DEBUG_URGENT,
6269                     ("pf_route6: TAILQ_EMPTY(&r->rpool.list)\n"));
6270                 goto bad;
6271         }
6272         if (s == NULL) {
6273                 pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
6274                     &naddr, NULL, &sn);
6275                 if (!PF_AZERO(&naddr, AF_INET6))
6276                         PF_ACPY((struct pf_addr *)&dst->sin6_addr,
6277                             &naddr, AF_INET6);
6278                 ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
6279         } else {
6280                 if (!PF_AZERO(&s->rt_addr, AF_INET6))
6281                         PF_ACPY((struct pf_addr *)&dst->sin6_addr,
6282                             &s->rt_addr, AF_INET6);
6283                 ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
6284         }
6285         if (ifp == NULL)
6286                 goto bad;
6287
6288         if (oifp != ifp) {
6289                 if (pf_test6(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
6290                         goto bad;
6291                 } else if (m0 == NULL) {
6292                         goto done;
6293                 }
6294                 if (m0->m_len < sizeof(struct ip6_hdr)) {
6295                         DPFPRINTF(PF_DEBUG_URGENT,
6296                             ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
6297                         goto bad;
6298                 }
6299                 ip6 = mtod(m0, struct ip6_hdr *);
6300         }
6301
6302         /*
6303          * If the packet is too large for the outgoing interface,
6304          * send back an icmp6 error.
6305          */
6306         if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr))
6307                 dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index);
6308         if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
6309                 nd6_output(ifp, ifp, m0, dst, NULL);
6310         } else {
6311                 in6_ifstat_inc(ifp, ifs6_in_toobig);
6312                 if (r->rt != PF_DUPTO)
6313                         icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
6314                 else
6315                         goto bad;
6316         }
6317
6318 done:
6319         if (r->rt != PF_DUPTO)
6320                 *m = NULL;
6321         return;
6322
6323 bad:
6324         m_freem(m0);
6325         goto done;
6326 }
6327 #endif /* INET6 */
6328
6329
6330 /*
6331  * check protocol (tcp/udp/icmp/icmp6) checksum and set mbuf flag
6332  *   off is the offset where the protocol header starts
6333  *   len is the total length of protocol header plus payload
6334  * returns 0 when the checksum is valid, otherwise returns 1.
6335  */
6336 /*
6337  * XXX
6338  * FreeBSD supports cksum offload for the following drivers.
6339  * em(4), gx(4), lge(4), nge(4), ti(4), xl(4)
6340  * If we can make full use of it we would outperform ipfw/ipfilter in
6341  * very heavy traffic.
6342  * I have not tested 'cause I don't have NICs that supports cksum offload.
6343  * (There might be problems. Typical phenomena would be
6344  *   1. No route message for UDP packet.
6345  *   2. No connection acceptance from external hosts regardless of rule set.)
6346  */
6347 int
6348 pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p,
6349     sa_family_t af)
6350 {
6351         u_int16_t sum = 0;
6352         int hw_assist = 0;
6353         struct ip *ip;
6354
6355         if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
6356                 return (1);
6357         if (m->m_pkthdr.len < off + len)
6358                 return (1);
6359
6360         switch (p) {
6361         case IPPROTO_TCP:
6362         case IPPROTO_UDP:
6363                 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
6364                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6365                                 sum = m->m_pkthdr.csum_data;
6366                         } else {
6367                                 ip = mtod(m, struct ip *);
6368                                 sum = in_pseudo(ip->ip_src.s_addr,
6369                                         ip->ip_dst.s_addr, htonl((u_short)len +
6370                                         m->m_pkthdr.csum_data + p));
6371                         }
6372                         sum ^= 0xffff;
6373                         ++hw_assist;
6374                 }
6375                 break;
6376         case IPPROTO_ICMP:
6377 #ifdef INET6
6378         case IPPROTO_ICMPV6:
6379 #endif /* INET6 */
6380                 break;
6381         default:
6382                 return (1);
6383         }
6384
6385         if (!hw_assist) {
6386                 switch (af) {
6387                 case AF_INET:
6388                         if (p == IPPROTO_ICMP) {
6389                                 if (m->m_len < off)
6390                                         return (1);
6391                                 m->m_data += off;
6392                                 m->m_len -= off;
6393                                 sum = in_cksum(m, len);
6394                                 m->m_data -= off;
6395                                 m->m_len += off;
6396                         } else {
6397                                 if (m->m_len < sizeof(struct ip))
6398                                         return (1);
6399                                 sum = in_cksum_range(m, p, off, len);
6400                                 if (sum == 0) {
6401                                         m->m_pkthdr.csum_flags |=
6402                                             (CSUM_DATA_VALID |
6403                                              CSUM_PSEUDO_HDR);
6404                                         m->m_pkthdr.csum_data = 0xffff;
6405                                 }
6406                         }
6407                         break;
6408 #ifdef INET6
6409                 case AF_INET6:
6410                         if (m->m_len < sizeof(struct ip6_hdr))
6411                                 return (1);
6412                         sum = in6_cksum(m, p, off, len);
6413                         /*
6414                          * XXX
6415                          * IPv6 H/W cksum off-load not supported yet!
6416                          *
6417                          * if (sum == 0) {
6418                          *      m->m_pkthdr.csum_flags |=
6419                          *          (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
6420                          *      m->m_pkthdr.csum_data = 0xffff;
6421                          *}
6422                          */
6423                         break;
6424 #endif /* INET6 */
6425                 default:
6426                         return (1);
6427                 }
6428         }
6429         if (sum) {
6430                 switch (p) {
6431                 case IPPROTO_TCP:
6432                         tcpstat.tcps_rcvbadsum++;
6433                         break;
6434                 case IPPROTO_UDP:
6435                         udp_stat.udps_badsum++;
6436                         break;
6437                 case IPPROTO_ICMP:
6438                         icmpstat.icps_checksum++;
6439                         break;
6440 #ifdef INET6
6441                 case IPPROTO_ICMPV6:
6442                         icmp6stat.icp6s_checksum++;
6443                         break;
6444 #endif /* INET6 */
6445                 }
6446                 return (1);
6447         }
6448         return (0);
6449 }
6450
6451 struct pf_divert *
6452 pf_find_divert(struct mbuf *m)
6453 {
6454         struct m_tag    *mtag;
6455
6456         if ((mtag = m_tag_find(m, PACKET_TAG_PF_DIVERT, NULL)) == NULL)
6457                 return (NULL);
6458
6459         return ((struct pf_divert *)(mtag + 1));
6460 }
6461
6462 struct pf_divert *
6463 pf_get_divert(struct mbuf *m)
6464 {
6465         struct m_tag    *mtag;
6466
6467         if ((mtag = m_tag_find(m, PACKET_TAG_PF_DIVERT, NULL)) == NULL) {
6468                 mtag = m_tag_get(PACKET_TAG_PF_DIVERT, sizeof(struct pf_divert),
6469                     M_NOWAIT);
6470                 if (mtag == NULL)
6471                         return (NULL);
6472                 bzero(mtag + 1, sizeof(struct pf_divert));
6473                 m_tag_prepend(m, mtag);
6474         }
6475
6476         return ((struct pf_divert *)(mtag + 1));
6477 }
6478
6479 #ifdef INET
6480
6481 /*
6482  * WARNING: pf_token held shared on entry, THIS IS CPU LOCALIZED CODE
6483  */
6484 int
6485 pf_test(int dir, struct ifnet *ifp, struct mbuf **m0,
6486     struct ether_header *eh, struct inpcb *inp)
6487 {
6488         struct pfi_kif          *kif;
6489         u_short                  action, reason = 0, log = 0;
6490         struct mbuf             *m = *m0;
6491         struct ip               *h = NULL;
6492         struct pf_rule          *a = NULL, *r = &pf_default_rule, *tr, *nr;
6493         struct pf_state         *s = NULL;
6494         struct pf_ruleset       *ruleset = NULL;
6495         struct pf_pdesc          pd;
6496         int                      off, dirndx;
6497 #ifdef ALTQ
6498         int                      pqid = 0;
6499 #endif
6500
6501         if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6502                 /* Skip us; continue in ipfw. */
6503                 return (PF_PASS);
6504         }
6505
6506         if (!pf_status.running)
6507                 return (PF_PASS);
6508
6509         memset(&pd, 0, sizeof(pd));
6510 #ifdef foo
6511         if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
6512                 kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
6513         else
6514 #endif
6515                 kif = (struct pfi_kif *)ifp->if_pf_kif;
6516
6517         if (kif == NULL) {
6518                 DPFPRINTF(PF_DEBUG_URGENT,
6519                     ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
6520                 return (PF_DROP);
6521         }
6522         if (kif->pfik_flags & PFI_IFLAG_SKIP)
6523                 return (PF_PASS);
6524
6525 #ifdef DIAGNOSTIC
6526         if ((m->m_flags & M_PKTHDR) == 0)
6527                 panic("non-M_PKTHDR is passed to pf_test");
6528 #endif /* DIAGNOSTIC */
6529
6530         if (m->m_pkthdr.len < (int)sizeof(*h)) {
6531                 action = PF_DROP;
6532                 REASON_SET(&reason, PFRES_SHORT);
6533                 log = 1;
6534                 goto done;
6535         }
6536
6537         /*
6538          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
6539          * so make sure pf.flags is clear.
6540          */
6541         if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
6542                 return (PF_PASS);
6543         m->m_pkthdr.pf.flags = 0;
6544         /* Re-Check when updating to > 4.4 */
6545         m->m_pkthdr.pf.statekey = NULL;
6546
6547         /* We do IP header normalization and packet reassembly here */
6548         if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
6549                 action = PF_DROP;
6550                 goto done;
6551         }
6552         m = *m0;        /* pf_normalize messes with m0 */
6553         h = mtod(m, struct ip *);
6554
6555         off = h->ip_hl << 2;
6556         if (off < (int)sizeof(*h)) {
6557                 action = PF_DROP;
6558                 REASON_SET(&reason, PFRES_SHORT);
6559                 log = 1;
6560                 goto done;
6561         }
6562
6563         pd.src = (struct pf_addr *)&h->ip_src;
6564         pd.dst = (struct pf_addr *)&h->ip_dst;
6565         pd.sport = pd.dport = NULL;
6566         pd.ip_sum = &h->ip_sum;
6567         pd.proto_sum = NULL;
6568         pd.proto = h->ip_p;
6569         pd.dir = dir;
6570         pd.sidx = (dir == PF_IN) ? 0 : 1;
6571         pd.didx = (dir == PF_IN) ? 1 : 0;
6572         pd.af = AF_INET;
6573         pd.tos = h->ip_tos;
6574         pd.tot_len = h->ip_len;
6575         pd.eh = eh;
6576
6577         /* handle fragments that didn't get reassembled by normalization */
6578         if (h->ip_off & (IP_MF | IP_OFFMASK)) {
6579                 action = pf_test_fragment(&r, dir, kif, m, h,
6580                     &pd, &a, &ruleset);
6581                 goto done;
6582         }
6583
6584         switch (h->ip_p) {
6585
6586         case IPPROTO_TCP: {
6587                 struct tcphdr   th;
6588
6589                 pd.hdr.tcp = &th;
6590                 if (!pf_pull_hdr(m, off, &th, sizeof(th),
6591                     &action, &reason, AF_INET)) {
6592                         log = action != PF_PASS;
6593                         goto done;
6594                 }
6595                 pd.p_len = pd.tot_len - off - (th.th_off << 2);
6596 #ifdef ALTQ
6597                 if ((th.th_flags & TH_ACK) && pd.p_len == 0)
6598                         pqid = 1;
6599 #endif
6600                 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6601                 if (action == PF_DROP)
6602                         goto done;
6603                 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6604                                            &reason);
6605                 if (action == PF_PASS) {
6606                         r = s->rule.ptr;
6607                         a = s->anchor.ptr;
6608                         log = s->log;
6609                 } else if (s == NULL) {
6610                         action = pf_test_rule(&r, &s, dir, kif,
6611                                               m, off, h, &pd, &a,
6612                                               &ruleset, NULL, inp);
6613                 }
6614                 break;
6615         }
6616
6617         case IPPROTO_UDP: {
6618                 struct udphdr   uh;
6619
6620                 pd.hdr.udp = &uh;
6621                 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6622                     &action, &reason, AF_INET)) {
6623                         log = action != PF_PASS;
6624                         goto done;
6625                 }
6626                 if (uh.uh_dport == 0 ||
6627                     ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6628                     ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6629                         action = PF_DROP;
6630                         REASON_SET(&reason, PFRES_SHORT);
6631                         goto done;
6632                 }
6633                 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6634                 if (action == PF_PASS) {
6635                         r = s->rule.ptr;
6636                         a = s->anchor.ptr;
6637                         log = s->log;
6638                 } else if (s == NULL) {
6639                         action = pf_test_rule(&r, &s, dir, kif,
6640                                               m, off, h, &pd, &a,
6641                                               &ruleset, NULL, inp);
6642                 }
6643                 break;
6644         }
6645
6646         case IPPROTO_ICMP: {
6647                 struct icmp     ih;
6648
6649                 pd.hdr.icmp = &ih;
6650                 if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
6651                     &action, &reason, AF_INET)) {
6652                         log = action != PF_PASS;
6653                         goto done;
6654                 }
6655                 action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
6656                                             &reason);
6657                 if (action == PF_PASS) {
6658                         r = s->rule.ptr;
6659                         a = s->anchor.ptr;
6660                         log = s->log;
6661                 } else if (s == NULL) {
6662                         action = pf_test_rule(&r, &s, dir, kif,
6663                                               m, off, h, &pd, &a,
6664                                               &ruleset, NULL, inp);
6665                 }
6666                 break;
6667         }
6668
6669         default:
6670                 action = pf_test_state_other(&s, dir, kif, m, &pd);
6671                 if (action == PF_PASS) {
6672                         r = s->rule.ptr;
6673                         a = s->anchor.ptr;
6674                         log = s->log;
6675                 } else if (s == NULL) {
6676                         action = pf_test_rule(&r, &s, dir, kif, m, off, h,
6677                                               &pd, &a, &ruleset, NULL, inp);
6678                 }
6679                 break;
6680         }
6681
6682 done:
6683         if (action == PF_PASS && h->ip_hl > 5 &&
6684             !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6685                 action = PF_DROP;
6686                 REASON_SET(&reason, PFRES_IPOPTIONS);
6687                 log = 1;
6688                 DPFPRINTF(PF_DEBUG_MISC,
6689                     ("pf: dropping packet with ip options\n"));
6690         }
6691
6692         if ((s && s->tag) || r->rtableid)
6693                 pf_tag_packet(m, s ? s->tag : 0, r->rtableid);
6694
6695 #if 0
6696         if (dir == PF_IN && s && s->key[PF_SK_STACK])
6697                 m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
6698 #endif
6699
6700 #ifdef ALTQ
6701         /*
6702          * Generate a hash code and qid request for ALTQ.  A qid of 0
6703          * is allowed and will cause altq to select the default queue.
6704          */
6705         if (action == PF_PASS) {
6706                 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
6707                 if (pqid || (pd.tos & IPTOS_LOWDELAY))
6708                         m->m_pkthdr.pf.qid = r->pqid;
6709                 else
6710                         m->m_pkthdr.pf.qid = r->qid;
6711                 m->m_pkthdr.pf.ecn_af = AF_INET;
6712                 m->m_pkthdr.pf.hdr = h;
6713                 /* add connection hash for fairq */
6714                 if (s) {
6715                         /* for fairq */
6716                         m->m_pkthdr.pf.state_hash = s->hash;
6717                         m->m_pkthdr.pf.flags |= PF_TAG_STATE_HASHED;
6718                 }
6719         }
6720 #endif /* ALTQ */
6721
6722         /*
6723          * connections redirected to loopback should not match sockets
6724          * bound specifically to loopback due to security implications,
6725          * see tcp_input() and in_pcblookup_listen().
6726          */
6727         if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6728             pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6729             (s->nat_rule.ptr->action == PF_RDR ||
6730             s->nat_rule.ptr->action == PF_BINAT) &&
6731             (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
6732                 m->m_pkthdr.pf.flags |= PF_TAG_TRANSLATE_LOCALHOST;
6733
6734         if (dir == PF_IN && action == PF_PASS && r->divert.port) {
6735                 struct pf_divert *divert;
6736
6737                 if ((divert = pf_get_divert(m))) {
6738                         m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED;
6739                         divert->port = r->divert.port;
6740                         divert->addr.ipv4 = r->divert.addr.v4;
6741                 }
6742         }
6743
6744         if (log) {
6745                 struct pf_rule *lr;
6746
6747                 if (s != NULL && s->nat_rule.ptr != NULL &&
6748                     s->nat_rule.ptr->log & PF_LOG_ALL)
6749                         lr = s->nat_rule.ptr;
6750                 else
6751                         lr = r;
6752                 PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, lr, a, ruleset,
6753                     &pd);
6754         }
6755
6756         kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6757         kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
6758
6759         if (action == PF_PASS || r->action == PF_DROP) {
6760                 dirndx = (dir == PF_OUT);
6761                 r->packets[dirndx]++;
6762                 r->bytes[dirndx] += pd.tot_len;
6763                 if (a != NULL) {
6764                         a->packets[dirndx]++;
6765                         a->bytes[dirndx] += pd.tot_len;
6766                 }
6767                 if (s != NULL) {
6768                         if (s->nat_rule.ptr != NULL) {
6769                                 s->nat_rule.ptr->packets[dirndx]++;
6770                                 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6771                         }
6772                         if (s->src_node != NULL) {
6773                                 s->src_node->packets[dirndx]++;
6774                                 s->src_node->bytes[dirndx] += pd.tot_len;
6775                         }
6776                         if (s->nat_src_node != NULL) {
6777                                 s->nat_src_node->packets[dirndx]++;
6778                                 s->nat_src_node->bytes[dirndx] += pd.tot_len;
6779                         }
6780                         dirndx = (dir == s->direction) ? 0 : 1;
6781                         s->packets[dirndx]++;
6782                         s->bytes[dirndx] += pd.tot_len;
6783                 }
6784                 tr = r;
6785                 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6786                 if (nr != NULL && r == &pf_default_rule)
6787                         tr = nr;
6788                 if (tr->src.addr.type == PF_ADDR_TABLE)
6789                         pfr_update_stats(tr->src.addr.p.tbl,
6790                             (s == NULL) ? pd.src :
6791                             &s->key[(s->direction == PF_IN)]->
6792                                 addr[(s->direction == PF_OUT)],
6793                             pd.af, pd.tot_len, dir == PF_OUT,
6794                             r->action == PF_PASS, tr->src.neg);
6795                 if (tr->dst.addr.type == PF_ADDR_TABLE)
6796                         pfr_update_stats(tr->dst.addr.p.tbl,
6797                             (s == NULL) ? pd.dst :
6798                             &s->key[(s->direction == PF_IN)]->
6799                                 addr[(s->direction == PF_IN)],
6800                             pd.af, pd.tot_len, dir == PF_OUT,
6801                             r->action == PF_PASS, tr->dst.neg);
6802         }
6803
6804
6805         if (action == PF_SYNPROXY_DROP) {
6806                 m_freem(*m0);
6807                 *m0 = NULL;
6808                 action = PF_PASS;
6809         } else if (r->rt) {
6810                 /* pf_route can free the mbuf causing *m0 to become NULL */
6811                 pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
6812         }
6813
6814         return (action);
6815 }
6816 #endif /* INET */
6817
6818 #ifdef INET6
6819
6820 /*
6821  * WARNING: pf_token held shared on entry, THIS IS CPU LOCALIZED CODE
6822  */
6823 int
6824 pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0,
6825     struct ether_header *eh, struct inpcb *inp)
6826 {
6827         struct pfi_kif          *kif;
6828         u_short                  action, reason = 0, log = 0;
6829         struct mbuf             *m = *m0, *n = NULL;
6830         struct ip6_hdr          *h = NULL;
6831         struct pf_rule          *a = NULL, *r = &pf_default_rule, *tr, *nr;
6832         struct pf_state         *s = NULL;
6833         struct pf_ruleset       *ruleset = NULL;
6834         struct pf_pdesc          pd;
6835         int                      off, terminal = 0, dirndx, rh_cnt = 0;
6836
6837         if (!pf_status.running)
6838                 return (PF_PASS);
6839
6840         memset(&pd, 0, sizeof(pd));
6841 #ifdef foo
6842         if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
6843                 kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
6844         else
6845 #endif
6846                 kif = (struct pfi_kif *)ifp->if_pf_kif;
6847
6848         if (kif == NULL) {
6849                 DPFPRINTF(PF_DEBUG_URGENT,
6850                     ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
6851                 return (PF_DROP);
6852         }
6853         if (kif->pfik_flags & PFI_IFLAG_SKIP)
6854                 return (PF_PASS);
6855
6856 #ifdef DIAGNOSTIC
6857         if ((m->m_flags & M_PKTHDR) == 0)
6858                 panic("non-M_PKTHDR is passed to pf_test6");
6859 #endif /* DIAGNOSTIC */
6860
6861         if (m->m_pkthdr.len < (int)sizeof(*h)) {
6862                 action = PF_DROP;
6863                 REASON_SET(&reason, PFRES_SHORT);
6864                 log = 1;
6865                 goto done;
6866         }
6867
6868         /*
6869          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
6870          * so make sure pf.flags is clear.
6871          */
6872         if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
6873                 return (PF_PASS);
6874         m->m_pkthdr.pf.flags = 0;
6875         /* Re-Check when updating to > 4.4 */
6876         m->m_pkthdr.pf.statekey = NULL;
6877
6878         /* We do IP header normalization and packet reassembly here */
6879         if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
6880                 action = PF_DROP;
6881                 goto done;
6882         }
6883         m = *m0;        /* pf_normalize messes with m0 */
6884         h = mtod(m, struct ip6_hdr *);
6885
6886 #if 1
6887         /*
6888          * we do not support jumbogram yet.  if we keep going, zero ip6_plen
6889          * will do something bad, so drop the packet for now.
6890          */
6891         if (htons(h->ip6_plen) == 0) {
6892                 action = PF_DROP;
6893                 REASON_SET(&reason, PFRES_NORM);        /*XXX*/
6894                 goto done;
6895         }
6896 #endif
6897
6898         pd.src = (struct pf_addr *)&h->ip6_src;
6899         pd.dst = (struct pf_addr *)&h->ip6_dst;
6900         pd.sport = pd.dport = NULL;
6901         pd.ip_sum = NULL;
6902         pd.proto_sum = NULL;
6903         pd.dir = dir;
6904         pd.sidx = (dir == PF_IN) ? 0 : 1;
6905         pd.didx = (dir == PF_IN) ? 1 : 0;
6906         pd.af = AF_INET6;
6907         pd.tos = 0;
6908         pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
6909         pd.eh = eh;
6910
6911         off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
6912         pd.proto = h->ip6_nxt;
6913         do {
6914                 switch (pd.proto) {
6915                 case IPPROTO_FRAGMENT:
6916                         action = pf_test_fragment(&r, dir, kif, m, h,
6917                             &pd, &a, &ruleset);
6918                         if (action == PF_DROP)
6919                                 REASON_SET(&reason, PFRES_FRAG);
6920                         goto done;
6921                 case IPPROTO_ROUTING: {
6922                         struct ip6_rthdr rthdr;
6923
6924                         if (rh_cnt++) {
6925                                 DPFPRINTF(PF_DEBUG_MISC,
6926                                     ("pf: IPv6 more than one rthdr\n"));
6927                                 action = PF_DROP;
6928                                 REASON_SET(&reason, PFRES_IPOPTIONS);
6929                                 log = 1;
6930                                 goto done;
6931                         }
6932                         if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
6933                             &reason, pd.af)) {
6934                                 DPFPRINTF(PF_DEBUG_MISC,
6935                                     ("pf: IPv6 short rthdr\n"));
6936                                 action = PF_DROP;
6937                                 REASON_SET(&reason, PFRES_SHORT);
6938                                 log = 1;
6939                                 goto done;
6940                         }
6941                         if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
6942                                 DPFPRINTF(PF_DEBUG_MISC,
6943                                     ("pf: IPv6 rthdr0\n"));
6944                                 action = PF_DROP;
6945                                 REASON_SET(&reason, PFRES_IPOPTIONS);
6946                                 log = 1;
6947                                 goto done;
6948                         }
6949                         /* FALLTHROUGH */
6950                 }
6951                 case IPPROTO_AH:
6952                 case IPPROTO_HOPOPTS:
6953                 case IPPROTO_DSTOPTS: {
6954                         /* get next header and header length */
6955                         struct ip6_ext  opt6;
6956
6957                         if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6958                             NULL, &reason, pd.af)) {
6959                                 DPFPRINTF(PF_DEBUG_MISC,
6960                                     ("pf: IPv6 short opt\n"));
6961                                 action = PF_DROP;
6962                                 log = 1;
6963                                 goto done;
6964                         }
6965                         if (pd.proto == IPPROTO_AH)
6966                                 off += (opt6.ip6e_len + 2) * 4;
6967                         else
6968                                 off += (opt6.ip6e_len + 1) * 8;
6969                         pd.proto = opt6.ip6e_nxt;
6970                         /* goto the next header */
6971                         break;
6972                 }
6973                 default:
6974                         terminal++;
6975                         break;
6976                 }
6977         } while (!terminal);
6978
6979         /* if there's no routing header, use unmodified mbuf for checksumming */
6980         if (!n)
6981                 n = m;
6982
6983         switch (pd.proto) {
6984
6985         case IPPROTO_TCP: {
6986                 struct tcphdr   th;
6987
6988                 pd.hdr.tcp = &th;
6989                 if (!pf_pull_hdr(m, off, &th, sizeof(th),
6990                     &action, &reason, AF_INET6)) {
6991                         log = action != PF_PASS;
6992                         goto done;
6993                 }
6994                 pd.p_len = pd.tot_len - off - (th.th_off << 2);
6995                 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6996                 if (action == PF_DROP)
6997                         goto done;
6998                 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6999                                            &reason);
7000                 if (action == PF_PASS) {
7001                         r = s->rule.ptr;
7002                         a = s->anchor.ptr;
7003                         log = s->log;
7004                 } else if (s == NULL) {
7005                         action = pf_test_rule(&r, &s, dir, kif,
7006                                               m, off, h, &pd, &a,
7007                                               &ruleset, NULL, inp);
7008                 }
7009                 break;
7010         }
7011
7012         case IPPROTO_UDP: {
7013                 struct udphdr   uh;
7014
7015                 pd.hdr.udp = &uh;
7016                 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
7017                     &action, &reason, AF_INET6)) {
7018                         log = action != PF_PASS;
7019                         goto done;
7020                 }
7021                 if (uh.uh_dport == 0 ||
7022                     ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
7023                     ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
7024                         action = PF_DROP;
7025                         REASON_SET(&reason, PFRES_SHORT);
7026                         goto done;
7027                 }
7028                 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
7029                 if (action == PF_PASS) {
7030                         r = s->rule.ptr;
7031                         a = s->anchor.ptr;
7032                         log = s->log;
7033                 } else if (s == NULL) {
7034                         action = pf_test_rule(&r, &s, dir, kif,
7035                                               m, off, h, &pd, &a,
7036                                               &ruleset, NULL, inp);
7037                 }
7038                 break;
7039         }
7040
7041         case IPPROTO_ICMPV6: {
7042                 struct icmp6_hdr        ih;
7043
7044                 pd.hdr.icmp6 = &ih;
7045                 if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
7046                     &action, &reason, AF_INET6)) {
7047                         log = action != PF_PASS;
7048                         goto done;
7049                 }
7050                 action = pf_test_state_icmp(&s, dir, kif,
7051                                             m, off, h, &pd, &reason);
7052                 if (action == PF_PASS) {
7053                         r = s->rule.ptr;
7054                         a = s->anchor.ptr;
7055                         log = s->log;
7056                 } else if (s == NULL) {
7057                         action = pf_test_rule(&r, &s, dir, kif,
7058                                               m, off, h, &pd, &a,
7059                                               &ruleset, NULL, inp);
7060                 }
7061                 break;
7062         }
7063
7064         default:
7065                 action = pf_test_state_other(&s, dir, kif, m, &pd);
7066                 if (action == PF_PASS) {
7067                         r = s->rule.ptr;
7068                         a = s->anchor.ptr;
7069                         log = s->log;
7070                 } else if (s == NULL) {
7071                         action = pf_test_rule(&r, &s, dir, kif, m, off, h,
7072                                               &pd, &a, &ruleset, NULL, inp);
7073                 }
7074                 break;
7075         }
7076
7077 done:
7078         if (n != m) {
7079                 m_freem(n);
7080                 n = NULL;
7081         }
7082
7083         /* handle dangerous IPv6 extension headers. */
7084         if (action == PF_PASS && rh_cnt &&
7085             !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
7086                 action = PF_DROP;
7087                 REASON_SET(&reason, PFRES_IPOPTIONS);
7088                 log = 1;
7089                 DPFPRINTF(PF_DEBUG_MISC,
7090                     ("pf: dropping packet with dangerous v6 headers\n"));
7091         }
7092
7093         if ((s && s->tag) || r->rtableid)
7094                 pf_tag_packet(m, s ? s->tag : 0, r->rtableid);
7095
7096 #if 0
7097         if (dir == PF_IN && s && s->key[PF_SK_STACK])
7098                 m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
7099 #endif
7100
7101 #ifdef ALTQ
7102         /*
7103          * Generate a hash code and qid request for ALTQ.  A qid of 0
7104          * is allowed and will cause altq to select the default queue.
7105          */
7106         if (action == PF_PASS) {
7107                 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
7108                 if (pd.tos & IPTOS_LOWDELAY)
7109                         m->m_pkthdr.pf.qid = r->pqid;
7110                 else
7111                         m->m_pkthdr.pf.qid = r->qid;
7112                 m->m_pkthdr.pf.ecn_af = AF_INET6;
7113                 m->m_pkthdr.pf.hdr = h;
7114                 if (s) {
7115                         /* for fairq */
7116                         m->m_pkthdr.pf.state_hash = s->hash;
7117                         m->m_pkthdr.pf.flags |= PF_TAG_STATE_HASHED;
7118                 }
7119         }
7120 #endif /* ALTQ */
7121
7122         if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
7123             pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
7124             (s->nat_rule.ptr->action == PF_RDR ||
7125             s->nat_rule.ptr->action == PF_BINAT) &&
7126             IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
7127                 m->m_pkthdr.pf.flags |= PF_TAG_TRANSLATE_LOCALHOST;
7128
7129         if (dir == PF_IN && action == PF_PASS && r->divert.port) {
7130                 struct pf_divert *divert;
7131
7132                 if ((divert = pf_get_divert(m))) {
7133                         m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED;
7134                         divert->port = r->divert.port;
7135                         divert->addr.ipv6 = r->divert.addr.v6;
7136                 }
7137         }
7138
7139         if (log) {
7140                 struct pf_rule *lr;
7141
7142                 if (s != NULL && s->nat_rule.ptr != NULL &&
7143                     s->nat_rule.ptr->log & PF_LOG_ALL)
7144                         lr = s->nat_rule.ptr;
7145                 else
7146                         lr = r;
7147                 PFLOG_PACKET(kif, h, m, AF_INET6, dir, reason, lr, a, ruleset,
7148                     &pd);
7149         }
7150
7151         kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
7152         kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
7153
7154         if (action == PF_PASS || r->action == PF_DROP) {
7155                 dirndx = (dir == PF_OUT);
7156                 r->packets[dirndx]++;
7157                 r->bytes[dirndx] += pd.tot_len;
7158                 if (a != NULL) {
7159                         a->packets[dirndx]++;
7160                         a->bytes[dirndx] += pd.tot_len;
7161                 }
7162                 if (s != NULL) {
7163                         if (s->nat_rule.ptr != NULL) {
7164                                 s->nat_rule.ptr->packets[dirndx]++;
7165                                 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
7166                         }
7167                         if (s->src_node != NULL) {
7168                                 s->src_node->packets[dirndx]++;
7169                                 s->src_node->bytes[dirndx] += pd.tot_len;
7170                         }
7171                         if (s->nat_src_node != NULL) {
7172                                 s->nat_src_node->packets[dirndx]++;
7173                                 s->nat_src_node->bytes[dirndx] += pd.tot_len;
7174                         }
7175                         dirndx = (dir == s->direction) ? 0 : 1;
7176                         s->packets[dirndx]++;
7177                         s->bytes[dirndx] += pd.tot_len;
7178                 }
7179                 tr = r;
7180                 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
7181                 if (nr != NULL && r == &pf_default_rule)
7182                         tr = nr;
7183                 if (tr->src.addr.type == PF_ADDR_TABLE)
7184                         pfr_update_stats(tr->src.addr.p.tbl,
7185                             (s == NULL) ? pd.src :
7186                             &s->key[(s->direction == PF_IN)]->addr[0],
7187                             pd.af, pd.tot_len, dir == PF_OUT,
7188                             r->action == PF_PASS, tr->src.neg);
7189                 if (tr->dst.addr.type == PF_ADDR_TABLE)
7190                         pfr_update_stats(tr->dst.addr.p.tbl,
7191                             (s == NULL) ? pd.dst :
7192                             &s->key[(s->direction == PF_IN)]->addr[1],
7193                             pd.af, pd.tot_len, dir == PF_OUT,
7194                             r->action == PF_PASS, tr->dst.neg);
7195         }
7196
7197
7198         if (action == PF_SYNPROXY_DROP) {
7199                 m_freem(*m0);
7200                 *m0 = NULL;
7201                 action = PF_PASS;
7202         } else if (r->rt)
7203                 /* pf_route6 can free the mbuf causing *m0 to become NULL */
7204                 pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
7205
7206         return (action);
7207 }
7208 #endif /* INET6 */
7209
7210 int
7211 pf_check_congestion(struct ifqueue *ifq)
7212 {
7213                 return (0);
7214 }