/*	$OpenBSD: pf_norm.c,v 1.113 2008/05/07 07:07:29 markus Exp $ */

/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * Copyright 2001 Niels Provos <provos@citi.umich.edu>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/kernel.h>

#include <net/if_types.h>
#include <net/route.h>
#include <net/pf/if_pflog.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>

#include <netinet/ip6.h>

#include <net/pf/pfvar.h>

#define PFFRAG_SEENLAST		0x0001	/* Seen the last fragment for this */
#define PFFRAG_NOBUFFER		0x0002	/* Non-buffering fragment cache */
#define PFFRAG_DROP		0x0004	/* Drop all fragments */
#define BUFFER_FRAGMENTS(fr)	(!((fr)->fr_flags & PFFRAG_NOBUFFER))
TAILQ_HEAD(pf_fragqueue, pf_fragment)	pf_fragqueue[MAXCPU];
TAILQ_HEAD(pf_cachequeue, pf_fragment)	pf_cachequeue[MAXCPU];

static __inline int	 pf_frag_compare(struct pf_fragment *,
			    struct pf_fragment *);
RB_HEAD(pf_frag_tree, pf_fragment)	pf_frag_tree[MAXCPU],
					pf_cache_tree[MAXCPU];
RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
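/*
 * Each CPU keeps its own fragment state: pf_fragqueue/pf_frag_tree hold
 * fully buffered fragments awaiting reassembly, while pf_cachequeue/
 * pf_cache_tree back the non-buffering fragment cache used by the
 * crop/drop scrub modes (see BUFFER_FRAGMENTS above).
 */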
/* Private prototypes */
void			 pf_ip2key(struct pf_fragment *, struct ip *);
void			 pf_remove_fragment(struct pf_fragment *);
void			 pf_flush_fragments(void);
void			 pf_free_fragment(struct pf_fragment *);
struct pf_fragment	*pf_find_fragment(struct ip *, struct pf_frag_tree *);
struct mbuf		*pf_reassemble(struct mbuf **, struct pf_fragment **,
			    struct pf_frent *, int);
struct mbuf		*pf_fragcache(struct mbuf **, struct ip *,
			    struct pf_fragment **, int, int, int *);
int			 pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
			    struct tcphdr *, int, sa_family_t);
#define DPFPRINTF(x) do {				\
	if (pf_status.debug >= PF_DEBUG_MISC) {		\
		kprintf("%s: ", __func__);		\
		kprintf x ;				\
	}						\
} while (0)
static MALLOC_DEFINE(M_PFFRAGPL, "pffrag", "pf fragment pool list");
static MALLOC_DEFINE(M_PFCACHEPL, "pffrcache", "pf fragment cache pool list");
static MALLOC_DEFINE(M_PFFRENTPL, "pffrent", "pf frent pool list");
static MALLOC_DEFINE(M_PFCENTPL, "pffrcent", "pf fragment cent pool list");
static MALLOC_DEFINE(M_PFSTATESCRUBPL, "pfstatescrub",
    "pf state scrub pool list");

struct malloc_type	*pf_frent_pl, *pf_frag_pl, *pf_cache_pl, *pf_cent_pl;
struct malloc_type	*pf_state_scrub_pl;
int			 pf_nfrents, pf_ncache;
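/*
 * Called once at pf initialization: set the pool limits and initialize
 * the per-CPU fragment queues and RB trees declared above.
 */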
pf_normalize_init(void)
    pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT);
    pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
    pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0);
    pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0);

    for (n = 0; n < MAXCPU; ++n) {
        TAILQ_INIT(&pf_fragqueue[n]);
        TAILQ_INIT(&pf_cachequeue[n]);
        RB_INIT(&pf_frag_tree[n]);
        RB_INIT(&pf_cache_tree[n]);
    }
pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
    if ((diff = a->fr_id - b->fr_id))
        return (diff);
    else if ((diff = a->fr_p - b->fr_p))
        return (diff);
    else if (a->fr_src.s_addr < b->fr_src.s_addr)
        return (-1);
    else if (a->fr_src.s_addr > b->fr_src.s_addr)
        return (1);
    else if (a->fr_dst.s_addr < b->fr_dst.s_addr)
        return (-1);
    else if (a->fr_dst.s_addr > b->fr_dst.s_addr)
        return (1);
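/*
 * Scan the per-CPU queues from the tail (the oldest entries) and free any
 * fragment whose fr_timeout has fallen behind the PFTM_FRAG timeout of
 * the default rule.
 */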
pf_purge_expired_fragments(void)
    struct pf_fragment  *frag;
    int                  cpu = mycpu->gd_cpuid;

    expire = time_second - pf_default_rule.timeout[PFTM_FRAG];

    while ((frag = TAILQ_LAST(&pf_fragqueue[cpu], pf_fragqueue)) != NULL) {
        KASSERT((BUFFER_FRAGMENTS(frag)),
            ("BUFFER_FRAGMENTS(frag) == 0: %s", __func__));
        if (frag->fr_timeout > expire)
            break;

        DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
        pf_free_fragment(frag);
    }

    while ((frag = TAILQ_LAST(&pf_cachequeue[cpu], pf_cachequeue)) != NULL) {
        KASSERT((!BUFFER_FRAGMENTS(frag)),
            ("BUFFER_FRAGMENTS(frag) != 0: %s", __func__));
        if (frag->fr_timeout > expire)
            break;

        DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
        pf_free_fragment(frag);
        KASSERT((TAILQ_EMPTY(&pf_cachequeue[cpu]) ||
            TAILQ_LAST(&pf_cachequeue[cpu], pf_cachequeue) != frag),
            ("!(TAILQ_EMPTY() || TAILQ_LAST() == frag): %s",
            __func__));
    }
/*
 * Try to flush old fragments to make space for new ones
 */
pf_flush_fragments(void)
    struct pf_fragment  *frag;
    int                  cpu = mycpu->gd_cpuid;

    goal = pf_nfrents * 9 / 10;
    DPFPRINTF(("trying to free > %d frents\n",
        pf_nfrents - goal));
    while (goal < pf_nfrents) {
        frag = TAILQ_LAST(&pf_fragqueue[cpu], pf_fragqueue);
        pf_free_fragment(frag);
    }

    goal = pf_ncache * 9 / 10;
    DPFPRINTF(("trying to free > %d cache entries\n",
        pf_ncache - goal));
    while (goal < pf_ncache) {
        frag = TAILQ_LAST(&pf_cachequeue[cpu], pf_cachequeue);
        pf_free_fragment(frag);
    }
/* Frees the fragments and all associated entries */
pf_free_fragment(struct pf_fragment *frag)
    struct pf_frent     *frent;
    struct pf_frcache   *frcache;

    /* Free all fragments */
    if (BUFFER_FRAGMENTS(frag)) {
        for (frent = LIST_FIRST(&frag->fr_queue); frent;
            frent = LIST_FIRST(&frag->fr_queue)) {
            LIST_REMOVE(frent, fr_next);

            m_freem(frent->fr_m);
            kfree(frent, M_PFFRENTPL);
        }
    } else {
        for (frcache = LIST_FIRST(&frag->fr_cache); frcache;
            frcache = LIST_FIRST(&frag->fr_cache)) {
            LIST_REMOVE(frcache, fr_next);

            KASSERT((LIST_EMPTY(&frag->fr_cache) ||
                LIST_FIRST(&frag->fr_cache)->fr_off >
                frcache->fr_end),
                ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >"
                " frcache->fr_end): %s", __func__));

            kfree(frcache, M_PFCENTPL);
        }
    }

    pf_remove_fragment(frag);
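/*
 * Build the RB-tree search key for a fragment: the protocol, IP id and
 * source/destination addresses together identify the datagram being
 * reassembled.
 */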
pf_ip2key(struct pf_fragment *key, struct ip *ip)
    key->fr_p = ip->ip_p;
    key->fr_id = ip->ip_id;
    key->fr_src.s_addr = ip->ip_src.s_addr;
    key->fr_dst.s_addr = ip->ip_dst.s_addr;
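/*
 * Look up a fragment descriptor by key.  On a hit the timeout is
 * refreshed and the entry is moved to the head of its queue, so the LRU
 * purge above only ever sees stale entries at the tail.
 */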
pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree)
    struct pf_fragment   key;
    struct pf_fragment  *frag;
    int                  cpu = mycpu->gd_cpuid;

    frag = RB_FIND(pf_frag_tree, tree, &key);
    if (frag != NULL) {
        /* XXX Are we sure we want to update the timeout? */
        frag->fr_timeout = time_second;
        if (BUFFER_FRAGMENTS(frag)) {
            TAILQ_REMOVE(&pf_fragqueue[cpu], frag, frag_next);
            TAILQ_INSERT_HEAD(&pf_fragqueue[cpu], frag, frag_next);
        } else {
            TAILQ_REMOVE(&pf_cachequeue[cpu], frag, frag_next);
            TAILQ_INSERT_HEAD(&pf_cachequeue[cpu], frag, frag_next);
        }
    }
/* Removes a fragment from the fragment queue and frees the fragment */
pf_remove_fragment(struct pf_fragment *frag)
    int cpu = mycpu->gd_cpuid;

    if (BUFFER_FRAGMENTS(frag)) {
        RB_REMOVE(pf_frag_tree, &pf_frag_tree[cpu], frag);
        TAILQ_REMOVE(&pf_fragqueue[cpu], frag, frag_next);
        kfree(frag, M_PFFRAGPL);
    } else {
        RB_REMOVE(pf_frag_tree, &pf_cache_tree[cpu], frag);
        TAILQ_REMOVE(&pf_cachequeue[cpu], frag, frag_next);
        kfree(frag, M_PFCACHEPL);
    }
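/*
 * FR_IP_OFF() turns the 13-bit fragment-offset field (counted in 8-byte
 * units) into a byte offset within the original datagram.
 */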
#define FR_IP_OFF(fr)	(((fr)->fr_ip->ip_off & IP_OFFMASK) << 3)
pf_reassemble(struct mbuf **m0, struct pf_fragment **frag,
    struct pf_frent *frent, int mff)
    struct mbuf     *m = *m0, *m2;
    struct pf_frent *frea, *next;
    struct pf_frent *frep = NULL;
    struct ip       *ip = frent->fr_ip;
    int              hlen = ip->ip_hl << 2;
    u_int16_t        off = (ip->ip_off & IP_OFFMASK) << 3;
    u_int16_t        ip_len = ip->ip_len - ip->ip_hl * 4;
    u_int16_t        max = ip_len + off;
    int              cpu = mycpu->gd_cpuid;

    KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)),
        ("! (*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __func__));

    /* Strip off ip header */

    /* Create a new reassembly queue for this packet */
    *frag = kmalloc(sizeof(struct pf_fragment), M_PFFRAGPL, M_NOWAIT);
    pf_flush_fragments();
    *frag = kmalloc(sizeof(struct pf_fragment), M_PFFRAGPL, M_NOWAIT);

    (*frag)->fr_flags = 0;
    (*frag)->fr_src = frent->fr_ip->ip_src;
    (*frag)->fr_dst = frent->fr_ip->ip_dst;
    (*frag)->fr_p = frent->fr_ip->ip_p;
    (*frag)->fr_id = frent->fr_ip->ip_id;
    (*frag)->fr_timeout = time_second;
    LIST_INIT(&(*frag)->fr_queue);

    RB_INSERT(pf_frag_tree, &pf_frag_tree[cpu], *frag);
    TAILQ_INSERT_HEAD(&pf_fragqueue[cpu], *frag, frag_next);

    /* We do not have a previous fragment */

    /*
     * Find a fragment after the current one:
     *  - off contains the real shifted offset.
     */
    LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
        if (FR_IP_OFF(frea) > off)

    KASSERT((frep != NULL || frea != NULL),
        ("!(frep != NULL || frea != NULL): %s", __func__));

        FR_IP_OFF(frep) + frep->fr_ip->ip_len - frep->fr_ip->ip_hl *

        precut = FR_IP_OFF(frep) + frep->fr_ip->ip_len -
            frep->fr_ip->ip_hl * 4 - off;
        if (precut >= ip_len)
        m_adj(frent->fr_m, precut);
        DPFPRINTF(("overlap -%d\n", precut));
        /* Enforce 8 byte boundaries */
        ip->ip_off = ip->ip_off + (precut >> 3);
        off = (ip->ip_off & IP_OFFMASK) << 3;
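    /*
     * Any queued fragment that starts inside the range covered by the new
     * one is trimmed in the loop below, or dropped entirely when it is
     * completely overlapped.
     */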
    for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
        aftercut = ip_len + off - FR_IP_OFF(frea);
        DPFPRINTF(("adjust overlap %d\n", aftercut));
        if (aftercut < frea->fr_ip->ip_len - frea->fr_ip->ip_hl
            frea->fr_ip->ip_len =
                frea->fr_ip->ip_len - aftercut;
            frea->fr_ip->ip_off = frea->fr_ip->ip_off +
            m_adj(frea->fr_m, aftercut);

        /* This fragment is completely overlapped, lose it */
        next = LIST_NEXT(frea, fr_next);
        LIST_REMOVE(frea, fr_next);
        kfree(frea, M_PFFRENTPL);

    /* Update maximum data size */
    if ((*frag)->fr_max < max)
        (*frag)->fr_max = max;
    /* This is the last segment */
        (*frag)->fr_flags |= PFFRAG_SEENLAST;

        LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
        LIST_INSERT_AFTER(frep, frent, fr_next);
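    /*
     * Reassembly is only attempted once the last fragment has been seen
     * and the queued fragments cover the datagram contiguously from
     * offset 0 up to fr_max.
     */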
    /* Check if we are completely reassembled */
    if (!((*frag)->fr_flags & PFFRAG_SEENLAST))

    /* Check if we have all the data */
    for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
        next = LIST_NEXT(frep, fr_next);

        off += frep->fr_ip->ip_len - frep->fr_ip->ip_hl * 4;
        if (off < (*frag)->fr_max &&
            (next == NULL || FR_IP_OFF(next) != off))
            DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
                off, next == NULL ? -1 : FR_IP_OFF(next),

    DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
    if (off < (*frag)->fr_max)

    /* We have all the data */
    frent = LIST_FIRST(&(*frag)->fr_queue);
    KASSERT((frent != NULL), ("frent == NULL: %s", __func__));
    if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
        DPFPRINTF(("drop: too big: %d\n", off));
        pf_free_fragment(*frag);

    next = LIST_NEXT(frent, fr_next);

    /* Magic from ip_input */
    kfree(frent, M_PFFRENTPL);
    for (frent = next; frent != NULL; frent = next) {
        next = LIST_NEXT(frent, fr_next);
        kfree(frent, M_PFFRENTPL);

    ip->ip_src = (*frag)->fr_src;
    ip->ip_dst = (*frag)->fr_dst;

    /* Remove from fragment queue */
    pf_remove_fragment(*frag);

    hlen = ip->ip_hl << 2;
    ip->ip_len = off + hlen;
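    /*
     * The reassembled datagram reuses the first fragment's header; ip_len
     * is rewritten to cover the header plus all collected payload.
     */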
    /* some debugging cruft by sklower, below, will go away soon */
    /* XXX this should be done elsewhere */
    if (m->m_flags & M_PKTHDR) {
        for (m2 = m; m2; m2 = m2->m_next)
        m->m_pkthdr.len = plen;

    DPFPRINTF(("complete: %p(%d)\n", m, ip->ip_len));

    /* Oops - fail safe - drop packet */
    kfree(frent, M_PFFRENTPL);
pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
    int drop, int *nomem)
    struct mbuf         *m = *m0;
    struct pf_frcache   *frp, *fra, *cur = NULL;
    int                  ip_len = h->ip_len - (h->ip_hl << 2);
    u_int16_t            off = h->ip_off << 3;
    u_int16_t            max = ip_len + off;
    int                  cpu = mycpu->gd_cpuid;

    KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)),
        ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __func__));

    /* Create a new range queue for this packet */
    *frag = kmalloc(sizeof(struct pf_fragment), M_PFCACHEPL, M_NOWAIT);
    pf_flush_fragments();
    *frag = kmalloc(sizeof(struct pf_fragment), M_PFCACHEPL, M_NOWAIT);

    /* Get an entry for the queue */
    cur = kmalloc(sizeof(struct pf_frcache), M_PFCENTPL, M_NOWAIT);
    kfree(*frag, M_PFCACHEPL);

    (*frag)->fr_flags = PFFRAG_NOBUFFER;
    (*frag)->fr_src = h->ip_src;
    (*frag)->fr_dst = h->ip_dst;
    (*frag)->fr_p = h->ip_p;
    (*frag)->fr_id = h->ip_id;
    (*frag)->fr_timeout = time_second;

    LIST_INIT(&(*frag)->fr_cache);
    LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);

    RB_INSERT(pf_frag_tree, &pf_cache_tree[cpu], *frag);
    TAILQ_INSERT_HEAD(&pf_cachequeue[cpu], *frag, frag_next);

    DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));
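    /*
     * The non-buffering cache never keeps the fragment data itself; it
     * only records the byte ranges (pf_frcache entries) that have already
     * been passed, so later overlaps can be detected and cropped or
     * dropped.
     */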
    /*
     * Find a fragment after the current one:
     *  - off contains the real shifted offset.
     */
    LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
        if (fra->fr_off > off)

    KASSERT((frp != NULL || fra != NULL),
        ("!(frp != NULL || fra != NULL): %s", __func__));

        precut = frp->fr_end - off;
        if (precut >= ip_len) {
            /* Fragment is entirely a duplicate */
            DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
                h->ip_id, frp->fr_off, frp->fr_end, off, max));

            /* They are adjacent.  Fixup cache entry */
            DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
                h->ip_id, frp->fr_off, frp->fr_end, off, max));
        } else if (precut > 0) {
            /*
             * The first part of this payload overlaps with a
             * fragment that has already been passed.
             * Need to trim off the first part of the payload.
             * But to do so easily, we need to create another
             * mbuf to throw the original header into.
             */
            DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
                h->ip_id, precut, frp->fr_off, frp->fr_end, off,

            /* Update the previous frag to encompass this one */

            /*
             * XXX Optimization opportunity
             * This is a very heavy way to trim the payload.
             * we could do it much faster by diddling mbuf
             * internals but that would be even less legible
             * than this mbuf magic.  For my next trick,
             * I'll pull a rabbit out of my laptop.
             */
            *m0 = m_dup(m, M_NOWAIT);
            /* From KAME Project : We have missed this! */
            m_adj(*m0, (h->ip_hl << 2) -
                (*m0)->m_pkthdr.len);

            KASSERT(((*m0)->m_next == NULL),
                ("(*m0)->m_next != NULL: %s",
            m_adj(m, precut + (h->ip_hl << 2));

            if (m->m_flags & M_PKTHDR) {
                for (t = m; t; t = t->m_next)
                m->m_pkthdr.len = plen;
            }
            h = mtod(m, struct ip *);

            KASSERT(((int)m->m_len ==
                ("m->m_len != h->ip_len - precut: %s",
            h->ip_off = h->ip_off +
            h->ip_len = h->ip_len - precut;
        } else {
            /* There is a gap between fragments */

            DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
                h->ip_id, -precut, frp->fr_off, frp->fr_end, off,

            cur = kmalloc(sizeof(struct pf_frcache), M_PFCENTPL, M_NOWAIT);

            LIST_INSERT_AFTER(frp, cur, fr_next);
        }
        aftercut = max - fra->fr_off;
            /* Adjacent fragments */
            DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
                h->ip_id, off, max, fra->fr_off, fra->fr_end));
        } else if (aftercut > 0) {
            /* Need to chop off the tail of this fragment */
            DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
                h->ip_id, aftercut, off, max, fra->fr_off,

            if (m->m_flags & M_PKTHDR) {
                for (t = m; t; t = t->m_next)
                m->m_pkthdr.len = plen;
            }
            h = mtod(m, struct ip *);
            KASSERT(((int)m->m_len == h->ip_len - aftercut),
                ("m->m_len != h->ip_len - aftercut: %s",
            h->ip_len = h->ip_len - aftercut;
        } else if (frp == NULL) {
            /* There is a gap between fragments */
            DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
                h->ip_id, -aftercut, off, max, fra->fr_off,

            cur = kmalloc(sizeof(struct pf_frcache), M_PFCENTPL, M_NOWAIT);

            LIST_INSERT_BEFORE(fra, cur, fr_next);
        }

        /* Need to glue together two separate fragment descriptors */
        if (cur && fra->fr_off <= cur->fr_end) {
            /* Need to merge in a previous 'cur' */
            DPFPRINTF(("fragcache[%d]: adjacent(merge "
                "%d-%d) %d-%d (%d-%d)\n",
                h->ip_id, cur->fr_off, cur->fr_end, off,
                max, fra->fr_off, fra->fr_end));
            fra->fr_off = cur->fr_off;
            LIST_REMOVE(cur, fr_next);
            kfree(cur, M_PFCENTPL);
        } else if (frp && fra->fr_off <= frp->fr_end) {
            /* Need to merge in a modified 'frp' */
            KASSERT((cur == NULL), ("cur != NULL: %s",
            DPFPRINTF(("fragcache[%d]: adjacent(merge "
                "%d-%d) %d-%d (%d-%d)\n",
                h->ip_id, frp->fr_off, frp->fr_end, off,
                max, fra->fr_off, fra->fr_end));
            fra->fr_off = frp->fr_off;
            LIST_REMOVE(frp, fr_next);
            kfree(frp, M_PFCENTPL);
        }

    /*
     * We must keep tracking the overall fragment even when
     * we're going to drop it anyway so that we know when to
     * free the overall descriptor.  Thus we drop the frag late.
     */

    /* Update maximum data size */
    if ((*frag)->fr_max < max)
        (*frag)->fr_max = max;

    /* This is the last segment */
        (*frag)->fr_flags |= PFFRAG_SEENLAST;

    /* Check if we are completely reassembled */
    if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
        LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
        LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
        /* Remove from fragment queue */
        DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
        pf_free_fragment(*frag);
    }

    /* Still need to pay attention to !IP_MF */
    if (!mff && *frag != NULL)
        (*frag)->fr_flags |= PFFRAG_SEENLAST;

    /* Still need to pay attention to !IP_MF */
    if (!mff && *frag != NULL)
        (*frag)->fr_flags |= PFFRAG_SEENLAST;

    /* This fragment has been deemed bad.  Don't reass */
    if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
        DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
    (*frag)->fr_flags |= PFFRAG_DROP;
pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
    struct mbuf         *m = *m0;
    struct pf_frent     *frent;
    struct pf_fragment  *frag = NULL;
    struct ip           *h = mtod(m, struct ip *);
    int                  mff = (h->ip_off & IP_MF);
    int                  hlen = h->ip_hl << 2;
    u_int16_t            fragoff = (h->ip_off & IP_OFFMASK) << 3;
    int                  cpu = mycpu->gd_cpuid;

    r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
        if (pfi_kif_match(r->kif, kif) == r->ifnot)
            r = r->skip[PF_SKIP_IFP].ptr;
        else if (r->direction && r->direction != dir)
            r = r->skip[PF_SKIP_DIR].ptr;
        else if (r->af && r->af != AF_INET)
            r = r->skip[PF_SKIP_AF].ptr;
        else if (r->proto && r->proto != h->ip_p)
            r = r->skip[PF_SKIP_PROTO].ptr;
        else if (PF_MISMATCHAW(&r->src.addr,
            (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
            r = r->skip[PF_SKIP_SRC_ADDR].ptr;
        else if (PF_MISMATCHAW(&r->dst.addr,
            (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
            r = r->skip[PF_SKIP_DST_ADDR].ptr;
        else if (r->match_tag && !pf_match_tag(m, r, &tag))
            r = TAILQ_NEXT(r, entries);

    if (r == NULL || r->action == PF_NOSCRUB)

    r->packets[dir == PF_OUT]++;
    r->bytes[dir == PF_OUT] += pd->tot_len;
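    /*
     * A matching scrub rule was found; everything below sanity-checks the
     * IP header and, for fragments, hands the packet to either the
     * buffering reassembly path or the non-buffering fragment cache.
     */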
    /* Check for illegal packets */
    if (hlen < (int)sizeof(struct ip))
    if (hlen > h->ip_len)

    /* Clear IP_DF if the rule uses the no-df option */
    if (r->rule_flag & PFRULE_NODF && h->ip_off & IP_DF) {
        u_int16_t ip_off = h->ip_off;

        h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
    }

    /* We will need other tests here */
    if (!fragoff && !mff)

    /* A fragment; rehash required. */
    m->m_flags &= ~M_HASH;

    /*
     * We're dealing with a fragment now.  Don't allow fragments
     * with IP_DF to enter the cache.  If the flag was cleared by
     * no-df above, fine.  Otherwise drop it.
     */
    if (h->ip_off & IP_DF) {
        DPFPRINTF(("IP_DF\n"));
    }

    ip_len = h->ip_len - hlen;

    /* All fragments are 8 byte aligned */
    if (mff && (ip_len & 0x7)) {
        DPFPRINTF(("mff and %d\n", ip_len));
    }

    /* Respect maximum length */
    if (fragoff + ip_len > IP_MAXPACKET) {
        DPFPRINTF(("max packet %d\n", fragoff + ip_len));
    }
    max = fragoff + ip_len;
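    /*
     * Two handling modes follow: without fragcrop/fragdrop the fragments
     * are fully buffered and rebuilt by pf_reassemble(); otherwise the
     * non-buffering pf_fragcache() path only tracks ranges and crops or
     * drops overlaps.
     */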
    if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
        /* Fully buffer all of the fragments */

        frag = pf_find_fragment(h, &pf_frag_tree[cpu]);

        /* Check if we saw the last fragment already */
        if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&

        /* Get an entry for the fragment queue */
        frent = kmalloc(sizeof(struct pf_frent), M_PFFRENTPL, M_NOWAIT);
            REASON_SET(reason, PFRES_MEMORY);

        /* Might return a completely reassembled mbuf, or NULL */
        DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
        *m0 = m = pf_reassemble(m0, &frag, frent, mff);

        if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))

        h = mtod(m, struct ip *);
        /* non-buffering fragment cache (drops or masks overlaps) */

        if (dir == PF_OUT && m->m_pkthdr.pf.flags & PF_TAG_FRAGCACHE) {
            /*
             * Already passed the fragment cache in the
             * input direction.  If we continued, it would
             * appear to be a dup and would be dropped.
             */
        }

        frag = pf_find_fragment(h, &pf_cache_tree[cpu]);

        /* Check if we saw the last fragment already */
        if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
            max > frag->fr_max) {
            if (r->rule_flag & PFRULE_FRAGDROP)
                frag->fr_flags |= PFFRAG_DROP;
        }

        *m0 = m = pf_fragcache(m0, h, &frag, mff,
            (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);

        m->m_pkthdr.pf.flags |= PF_TAG_FRAGCACHE;

        if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
    /* At this point, only IP_DF is allowed in ip_off */
    if (h->ip_off & ~IP_DF) {
        u_int16_t ip_off = h->ip_off;

        h->ip_sum = pf_cksum_fixup(h->ip_sum, htons(ip_off), htons(h->ip_off), 0);
    }

    /* Enforce a minimum ttl, may cause endless packet loops */
    if (r->min_ttl && h->ip_ttl < r->min_ttl) {
        u_int16_t ip_ttl = h->ip_ttl;

        h->ip_ttl = r->min_ttl;
        h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
    }

    if (r->rule_flag & PFRULE_SET_TOS) {
        ov = *(u_int16_t *)h;
        h->ip_tos = r->set_tos;
        nv = *(u_int16_t *)h;

        h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
    }

    if (r->rule_flag & PFRULE_RANDOMID) {
        u_int16_t ip_id = h->ip_id;

        h->ip_id = ip_randomid();
        h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
    }
    if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
        pd->flags |= PFDESC_IP_REAS;
    /* Enforce a minimum ttl, may cause endless packet loops */
    if (r->min_ttl && h->ip_ttl < r->min_ttl) {
        u_int16_t ip_ttl = h->ip_ttl;

        h->ip_ttl = r->min_ttl;
        h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
    }

    if (r->rule_flag & PFRULE_SET_TOS) {
        ov = *(u_int16_t *)h;
        h->ip_tos = r->set_tos;
        nv = *(u_int16_t *)h;

        h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
    }
    if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
        pd->flags |= PFDESC_IP_REAS;

    REASON_SET(reason, PFRES_MEMORY);
    if (r != NULL && r->log)
        PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);

    REASON_SET(reason, PFRES_NORM);
    if (r != NULL && r->log)
        PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);

    DPFPRINTF(("dropping bad fragment\n"));

    /* Free associated fragments */
    pf_free_fragment(frag);

    REASON_SET(reason, PFRES_FRAG);
    if (r != NULL && r->log)
        PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
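/*
 * IPv6 scrub: walk the extension-header chain looking for a fragment
 * header, validate jumbo-payload options and header lengths, and clamp the
 * hop limit; actual IPv6 reassembly is not performed here.
 */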
pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
    u_short *reason, struct pf_pdesc *pd)
    struct mbuf             *m = *m0;
    struct ip6_hdr          *h = mtod(m, struct ip6_hdr *);
    struct ip6_opt_jumbo     jumbo;
    struct ip6_frag          frag;
    u_int32_t                jumbolen = 0, plen;
    u_int16_t                fragoff = 0;

    r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
        if (pfi_kif_match(r->kif, kif) == r->ifnot)
            r = r->skip[PF_SKIP_IFP].ptr;
        else if (r->direction && r->direction != dir)
            r = r->skip[PF_SKIP_DIR].ptr;
        else if (r->af && r->af != AF_INET6)
            r = r->skip[PF_SKIP_AF].ptr;
#if 0 /* header chain! */
        else if (r->proto && r->proto != h->ip6_nxt)
            r = r->skip[PF_SKIP_PROTO].ptr;
#endif
        else if (PF_MISMATCHAW(&r->src.addr,
            (struct pf_addr *)&h->ip6_src, AF_INET6,
            r = r->skip[PF_SKIP_SRC_ADDR].ptr;
        else if (PF_MISMATCHAW(&r->dst.addr,
            (struct pf_addr *)&h->ip6_dst, AF_INET6,
            r = r->skip[PF_SKIP_DST_ADDR].ptr;

    if (r == NULL || r->action == PF_NOSCRUB)

    r->packets[dir == PF_OUT]++;
    r->bytes[dir == PF_OUT] += pd->tot_len;

    /* Check for illegal packets */
    if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)

    off = sizeof(struct ip6_hdr);

        case IPPROTO_FRAGMENT:
        case IPPROTO_ROUTING:
        case IPPROTO_DSTOPTS:
            if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
            if (proto == IPPROTO_AH)
                off += (ext.ip6e_len + 2) * 4;
                off += (ext.ip6e_len + 1) * 8;
            proto = ext.ip6e_nxt;
        case IPPROTO_HOPOPTS:
            if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
            optend = off + (ext.ip6e_len + 1) * 8;
            ooff = off + sizeof(ext);
                if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
                    sizeof(opt.ip6o_type), NULL, NULL,
                if (opt.ip6o_type == IP6OPT_PAD1) {
                if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
                    NULL, NULL, AF_INET6))
                if (ooff + sizeof(opt) + opt.ip6o_len > optend)
                switch (opt.ip6o_type) {
                    if (h->ip6_plen != 0)
                    if (!pf_pull_hdr(m, ooff, &jumbo,
                        sizeof(jumbo), NULL, NULL,
                    memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
                    jumbolen = ntohl(jumbolen);
                    if (jumbolen <= IPV6_MAXPACKET)
                    if (sizeof(struct ip6_hdr) + jumbolen !=
                ooff += sizeof(opt) + opt.ip6o_len;
            } while (ooff < optend);
            proto = ext.ip6e_nxt;
    } while (!terminal);
    /* jumbo payload option must be present, or plen > 0 */
    if (ntohs(h->ip6_plen) == 0)

    plen = ntohs(h->ip6_plen);
    if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)

    /* Enforce a minimum ttl, may cause endless packet loops */
    if (r->min_ttl && h->ip6_hlim < r->min_ttl)
        h->ip6_hlim = r->min_ttl;
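    /*
     * Unlike IPv4 there is no header checksum to repair, so the hop limit
     * can be rewritten in place without a pf_cksum_fixup() call.
     */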
    if (ntohs(h->ip6_plen) == 0 || jumbolen)

    plen = ntohs(h->ip6_plen);

    if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
    fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
    if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET)

    /* do something about it */
    /* remember to set pd->flags |= PFDESC_IP_REAS */

    REASON_SET(reason, PFRES_SHORT);
    if (r != NULL && r->log)
        PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);

    REASON_SET(reason, PFRES_NORM);
    if (r != NULL && r->log)
        PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);

    REASON_SET(reason, PFRES_FRAG);
    if (r != NULL && r->log)
        PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
    int off, void *h, struct pf_pdesc *pd)
    struct pf_rule  *r, *rm = NULL;
    struct tcphdr   *th = pd->hdr.tcp;
    sa_family_t      af = pd->af;

    r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
        if (pfi_kif_match(r->kif, kif) == r->ifnot)
            r = r->skip[PF_SKIP_IFP].ptr;
        else if (r->direction && r->direction != dir)
            r = r->skip[PF_SKIP_DIR].ptr;
        else if (r->af && r->af != af)
            r = r->skip[PF_SKIP_AF].ptr;
        else if (r->proto && r->proto != pd->proto)
            r = r->skip[PF_SKIP_PROTO].ptr;
        else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
            r = r->skip[PF_SKIP_SRC_ADDR].ptr;
        else if (r->src.port_op && !pf_match_port(r->src.port_op,
            r->src.port[0], r->src.port[1], th->th_sport))
            r = r->skip[PF_SKIP_SRC_PORT].ptr;
        else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
            r = r->skip[PF_SKIP_DST_ADDR].ptr;
        else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
            r->dst.port[0], r->dst.port[1], th->th_dport))
            r = r->skip[PF_SKIP_DST_PORT].ptr;
        else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
            pf_osfp_fingerprint(pd, m, off, th),
            r = TAILQ_NEXT(r, entries);

    if (rm == NULL || rm->action == PF_NOSCRUB)

    r->packets[dir == PF_OUT]++;
    r->bytes[dir == PF_OUT] += pd->tot_len;

    if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
        pd->flags |= PFDESC_TCP_NORM;
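    /*
     * Flag normalization follows: clear the reserved header bits, reject
     * illegal flag combinations, and strip an urgent pointer set without
     * TH_URG, fixing up th_sum for every rewrite.
     */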
    flags = th->th_flags;
    if (flags & TH_SYN) {
        /* Illegal packet */

    /* Illegal packet */
    if (!(flags & (TH_ACK|TH_RST)))

    if (!(flags & TH_ACK)) {
        /* These flags are only valid if ACK is set */
        if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
    }

    /* Check for illegal header length */
    if (th->th_off < (sizeof(struct tcphdr) >> 2))

    /* If flags changed, or reserved data set, then adjust */
    if (flags != th->th_flags || th->th_x2 != 0) {
        ov = *(u_int16_t *)(&th->th_ack + 1);
        th->th_flags = flags;
        nv = *(u_int16_t *)(&th->th_ack + 1);

        th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
    }

    /* Remove urgent pointer, if TH_URG is not set */
    if (!(flags & TH_URG) && th->th_urp) {
        th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
    }

    /* Process options */
    if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af))

    /* copy back packet headers if we sanitized */
    m_copyback(m, off, sizeof(*th), (caddr_t)th);

    REASON_SET(&reason, PFRES_NORM);
    if (rm != NULL && r->log)
        PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, r, NULL, NULL, pd);
pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
    struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
    u_int32_t tsval, tsecr;

    KASSERT((src->scrub == NULL),
        ("pf_normalize_tcp_init: src->scrub != NULL"));

    src->scrub = kmalloc(sizeof(struct pf_state_scrub), M_PFSTATESCRUBPL,
    if (src->scrub == NULL)

        struct ip *h = mtod(m, struct ip *);
        src->scrub->pfss_ttl = h->ip_ttl;

        struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
        src->scrub->pfss_ttl = h->ip6_hlim;

    /*
     * All normalizations below are only begun if we see the start of
     * the connections.  They must all set an enabled bit in pfss_flags
     */
    if ((th->th_flags & TH_SYN) == 0)

    if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
        pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
        /* Diddle with TCP options */
        opt = hdr + sizeof(struct tcphdr);
        hlen = (th->th_off << 2) - sizeof(struct tcphdr);
        while (hlen >= TCPOLEN_TIMESTAMP) {
            case TCPOPT_EOL:    /* FALLTHROUGH */
            case TCPOPT_TIMESTAMP:
                if (opt[1] >= TCPOLEN_TIMESTAMP) {
                    src->scrub->pfss_flags |=
                    src->scrub->pfss_ts_mod = karc4random();

                    /* note PFSS_PAWS not set yet */
                    memcpy(&tsval, &opt[2],
                    memcpy(&tsecr, &opt[6],
                    src->scrub->pfss_tsval0 = ntohl(tsval);
                    src->scrub->pfss_tsval = ntohl(tsval);
                    src->scrub->pfss_tsecr = ntohl(tsecr);
                    getmicrouptime(&src->scrub->pfss_last);
            hlen -= MAX(opt[1], 2);
            opt += MAX(opt[1], 2);
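    /*
     * The option walk advances by at least two bytes per iteration
     * (MAX(opt[1], 2)), so a malformed zero-length option cannot stall
     * the loop.
     */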
pf_normalize_tcp_cleanup(struct pf_state *state)
    if (state->src.scrub)
        kfree(state->src.scrub, M_PFSTATESCRUBPL);
    if (state->dst.scrub)
        kfree(state->dst.scrub, M_PFSTATESCRUBPL);

    /* Someday... flush the TCP segment reassembly descriptors. */
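/*
 * Stateful TCP normalization: clamp the TTL to the largest value seen for
 * the connection, modulate RFC1323 timestamps, and run the PAWS-style
 * sanity checks that guard against wrapped or injected timestamps.
 */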
pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
    u_short *reason, struct tcphdr *th, struct pf_state *state,
    struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
    struct timeval uptime;
    u_int32_t tsval, tsecr;
    u_int tsval_from_last;

    KASSERT((src->scrub || dst->scrub),
        ("pf_normalize_tcp_stateful: src->scrub && dst->scrub!"));

    /*
     * Enforce the minimum TTL seen for this connection.  Negate a common
     * technique to evade an intrusion detection system and confuse
     * firewall state code.
     */
        struct ip *h = mtod(m, struct ip *);
        if (h->ip_ttl > src->scrub->pfss_ttl)
            src->scrub->pfss_ttl = h->ip_ttl;
        h->ip_ttl = src->scrub->pfss_ttl;

        struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
        if (h->ip6_hlim > src->scrub->pfss_ttl)
            src->scrub->pfss_ttl = h->ip6_hlim;
        h->ip6_hlim = src->scrub->pfss_ttl;

    if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
        ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
        (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
        pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
        /* Diddle with TCP options */
        opt = hdr + sizeof(struct tcphdr);
        hlen = (th->th_off << 2) - sizeof(struct tcphdr);
        while (hlen >= TCPOLEN_TIMESTAMP) {
            case TCPOPT_EOL:    /* FALLTHROUGH */
            case TCPOPT_TIMESTAMP:
                /*
                 * Modulate the timestamps.  Can be used for
                 * NAT detection, OS uptime determination or
                 */

                    /* Huh?  Multiple timestamps!? */
                    if (pf_status.debug >= PF_DEBUG_MISC) {
                        DPFPRINTF(("multiple TS??"));
                        pf_print_state(state);
                    REASON_SET(reason, PFRES_TS);

                if (opt[1] >= TCPOLEN_TIMESTAMP) {
                    memcpy(&tsval, &opt[2],
                    if (tsval && src->scrub &&
                        (src->scrub->pfss_flags &
                        tsval = ntohl(tsval);
                        pf_change_a(&opt[2],
                            src->scrub->pfss_ts_mod),

                    /* Modulate TS reply iff valid (!0) */
                    memcpy(&tsecr, &opt[6],
                    if (tsecr && dst->scrub &&
                        (dst->scrub->pfss_flags &
                        tsecr = ntohl(tsecr)
                            - dst->scrub->pfss_ts_mod;
                        pf_change_a(&opt[6],
                            &th->th_sum, htonl(tsecr),
            hlen -= MAX(opt[1], 2);
            opt += MAX(opt[1], 2);

            /* Copyback the options, caller copies back header */
            m_copyback(m, off + sizeof(struct tcphdr),
                (th->th_off << 2) - sizeof(struct tcphdr), hdr +
                sizeof(struct tcphdr));

    /*
     * Must invalidate PAWS checks on connections idle for too long.
     * The fastest allowed timestamp clock is 1ms.  That turns out to
     * be about 24 days before it wraps.  XXX Right now our lowerbound
     * TS echo check only works for the first 12 days of a connection
     * when the TS has exhausted half its 32bit space
     */
#define TS_MAX_IDLE	(24*24*60*60)
#define TS_MAX_CONN	(12*24*60*60)	/* XXX remove when better tsecr check */

    getmicrouptime(&uptime);
    if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
        (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
        time_second - state->creation > TS_MAX_CONN)) {
        if (pf_status.debug >= PF_DEBUG_MISC) {
            DPFPRINTF(("src idled out of PAWS\n"));
            pf_print_state(state);
        src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)

    if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
        uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
        if (pf_status.debug >= PF_DEBUG_MISC) {
            DPFPRINTF(("dst idled out of PAWS\n"));
            pf_print_state(state);
        dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)

    if (got_ts && src->scrub && dst->scrub &&
        (src->scrub->pfss_flags & PFSS_PAWS) &&
        (dst->scrub->pfss_flags & PFSS_PAWS)) {
        /*
         * Validate that the timestamps are "in-window".
         * RFC1323 describes TCP Timestamp options that allow
         * measurement of RTT (round trip time) and PAWS
         * (protection against wrapped sequence numbers).  PAWS
         * gives us a set of rules for rejecting packets on
         * long fat pipes (packets that were somehow delayed
         * in transit longer than the time it took to send the
         * full TCP sequence space of 4Gb).  We can use these
         * rules and infer a few others that will let us treat
         * the 32bit timestamp and the 32bit echoed timestamp
         * as sequence numbers to prevent a blind attacker from
         * inserting packets into a connection.
         *
         * - The timestamp on this packet must be greater than
         *   or equal to the last value echoed by the other
         *   endpoint.  The RFC says those will be discarded
         *   since it is a dup that has already been acked.
         *   This gives us a lowerbound on the timestamp.
         *       timestamp >= other last echoed timestamp
         * - The timestamp will be less than or equal to
         *   the last timestamp plus the time between the
         *   last packet and now.  The RFC defines the max
         *   clock rate as 1ms.  We will allow clocks to be
         *   up to 10% fast and will allow a total difference
         *   of 30 seconds due to a route change.  And this
         *   gives us an upperbound on the timestamp.
         *       timestamp <= last timestamp + max ticks
         *   We have to be careful here.  Windows will send an
         *   initial timestamp of zero and then initialize it
         *   to a random value after the 3whs; presumably to
         *   avoid a DoS by having to call an expensive RNG
         *   during a SYN flood.  Proof MS has at least one
         *   good security geek.
         * - The TCP timestamp option must also echo the other
         *   endpoint's timestamp.  The timestamp echoed is the
         *   one carried on the earliest unacknowledged segment
         *   on the left edge of the sequence window.  The RFC
         *   states that the host will reject any echoed
         *   timestamps that were larger than any ever sent.
         *   This gives us an upperbound on the TS echo.
         *       tescr <= largest_tsval
         * - The lowerbound on the TS echo is a little more
         *   tricky to determine.  The other endpoint's echoed
         *   values will not decrease.  But there may be
         *   network conditions that re-order packets and
         *   cause our view of them to decrease.  For now the
         *   only lowerbound we can safely determine is that
         *   the TS echo will never be less than the original
         *   TS.  XXX There is probably a better lowerbound.
         *   Remove TS_MAX_CONN with better lowerbound check.
         *       tescr >= other original TS
         *
         * It is also important to note that the fastest
         * timestamp clock of 1ms will wrap its 32bit space in
         * 24 days.  So we just disable TS checking after 24
         * days of idle time.  We actually must use a 12d
         * connection limit until we can come up with a better
         * lowerbound to the TS echo check.
         */
        struct timeval delta_ts;

        /*
         * PFTM_TS_DIFF is how many seconds of leeway to allow
         * a host's timestamp.  This can happen if the previous
         * packet got delayed in transit for much longer than
         */
        if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
            ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];

        /* Calculate max ticks since the last timestamp */
#define TS_MAXFREQ	1100		/* RFC max TS freq of 1Khz + 10% skew */
#define TS_MICROSECS	1000000		/* microseconds per second */
#define timersub(tvp, uvp, vvp)						\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_usec += 1000000;			\

        timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
        tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
        tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);

        if ((src->state >= TCPS_ESTABLISHED &&
            dst->state >= TCPS_ESTABLISHED) &&
            (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
            SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
            (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
            SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
            /*
             * Bad RFC1323 implementation or an insertion attack.
             *
             * - Solaris 2.6 and 2.7 are known to send another ACK
             *   after the FIN,FIN|ACK,ACK closing that carries
             */
            DPFPRINTF(("Timestamp failed %c%c%c%c\n",
                SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
                SEQ_GT(tsval, src->scrub->pfss_tsval +
                tsval_from_last) ? '1' : ' ',
                SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
                SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
            DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
                "idle: %lus %lums\n",
                tsval, tsecr, tsval_from_last, delta_ts.tv_sec,
                delta_ts.tv_usec / 1000));
            DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
                src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
            DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u"
                "\n", dst->scrub->pfss_tsval,
                dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
            if (pf_status.debug >= PF_DEBUG_MISC) {
                pf_print_state(state);
                pf_print_flags(th->th_flags);
            REASON_SET(reason, PFRES_TS);

        /* XXX I'd really like to require tsecr but it's optional */

    } else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
        ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
        || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
        src->scrub && dst->scrub &&
        (src->scrub->pfss_flags & PFSS_PAWS) &&
        (dst->scrub->pfss_flags & PFSS_PAWS)) {
        /*
         * Didn't send a timestamp.  Timestamps aren't really useful
         * - connection opening or closing (often not even sent).
         *   but we must not let an attacker put a FIN on a
         *   data packet to sneak it through our ESTABLISHED check.
         * - on a TCP reset.  RFC suggests not even looking at TS.
         * - on an empty ACK.  The TS will not be echoed so it will
         *   probably not help keep the RTT calculation in sync and
         *   there isn't as much danger when the sequence numbers
         *   got wrapped.  So some stacks don't include TS on empty
         *
         * To minimize the disruption to mostly RFC1323 conformant
         * stacks, we will only require timestamps on data packets.
         *
         * And what do ya know, we cannot require timestamps on data
         * packets.  There appear to be devices that do legitimate
         * TCP connection hijacking.  There are HTTP devices that allow
         * a 3whs (with timestamps) and then buffer the HTTP request.
         * If the intermediate device has the HTTP response cache, it
         * will spoof the response but not bother timestamping its
         * packets.  So we can look for the presence of a timestamp in
         * the first data packet and if there, require it in all future
         */
        if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
            /*
             * Hey!  Someone tried to sneak a packet in.  Or the
             * stack changed its RFC1323 behavior?!?!
             */
            if (pf_status.debug >= PF_DEBUG_MISC) {
                DPFPRINTF(("Did not receive expected RFC1323 "
                pf_print_state(state);
                pf_print_flags(th->th_flags);
            REASON_SET(reason, PFRES_TS);

    /*
     * We will note if a host sends his data packets with or without
     * timestamps.  And require all data packets to contain a timestamp
     * if the first does.  PAWS implicitly requires that all data packets be
     * timestamped.  But I think there are middle-man devices that hijack
     * TCP streams immediately after the 3whs and don't timestamp their
     * packets (seen in a WWW accelerator or cache).
     */
    if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
        (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
            src->scrub->pfss_flags |= PFSS_DATA_TS;
            src->scrub->pfss_flags |= PFSS_DATA_NOTS;
            if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
                (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
                /* Don't warn if other host rejected RFC1323 */
                DPFPRINTF(("Broken RFC1323 stack did not "
                    "timestamp data packet. Disabled PAWS "
                pf_print_state(state);
                pf_print_flags(th->th_flags);

    /*
     * Update PAWS values
     */
    if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
        (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
        getmicrouptime(&src->scrub->pfss_last);
        if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
            (src->scrub->pfss_flags & PFSS_PAWS) == 0)
            src->scrub->pfss_tsval = tsval;

        if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
            (src->scrub->pfss_flags & PFSS_PAWS) == 0)
            src->scrub->pfss_tsecr = tsecr;

        if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
            (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
            src->scrub->pfss_tsval0 == 0)) {
            /* tsval0 MUST be the lowest timestamp */
            src->scrub->pfss_tsval0 = tsval;

        /* Only fully initialized after a TS gets echoed */
        if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
            src->scrub->pfss_flags |= PFSS_PAWS;

    /* I have a dream....  TCP segment reassembly.... */
pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
    int off, sa_family_t af)
    int      opt, cnt, optlen = 0;
    u_char   opts[TCP_MAXOLEN];
    u_char  *optp = opts;

    thoff = th->th_off << 2;
    cnt = thoff - sizeof(struct tcphdr);

    if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt,
    for (; cnt > 0; cnt -= optlen, optp += optlen) {
        if (opt == TCPOPT_EOL)
        if (opt == TCPOPT_NOP)
        if (optlen < 2 || optlen > cnt)
            mss = (u_int16_t *)(optp + 2);
            if ((ntohs(*mss)) > r->max_mss) {
                th->th_sum = pf_cksum_fixup(th->th_sum,
                    *mss, htons(r->max_mss), 0);
                *mss = htons(r->max_mss);
            }
    }

    m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts);