2 * Copyright (C) 1995-2002 by Darren Reed.
4 * See the IPFILTER.LICENCE file for details on licencing.
6 * @(#)ip_state.c 1.8 6/5/96 (C) 1993-2000 Darren Reed
7 * @(#)$Id: ip_state.c,v 2.30.2.74 2002/07/27 15:58:10 darrenr Exp $
8 * $FreeBSD: src/sys/contrib/ipfilter/netinet/ip_state.c,v 1.21.2.6 2004/07/04 09:24:39 darrenr Exp $
9 * $DragonFly: src/sys/contrib/ipfilter/netinet/ip_state.c,v 1.10 2005/06/05 12:17:46 corecode Exp $
12 #if defined(__sgi) && (IRIX > 602)
13 # include <sys/ptimers.h>
15 #include <sys/errno.h>
16 #include <sys/types.h>
17 #include <sys/param.h>
19 #if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \
21 # include "opt_ipfilter_log.h"
23 #if defined(_KERNEL) && (defined(__DragonFly__) || \
24 (defined(__FreeBSD_version) && \
25 __FreeBSD_version >= 400000)) && !defined(KLD_MODULE)
26 #include "opt_inet6.h"
28 #if !defined(_KERNEL) && !defined(KERNEL) && !defined(__KERNEL__)
34 # include <linux/kernel.h>
35 # include <linux/module.h>
38 #if (defined(KERNEL) || defined(_KERNEL)) && (defined(__DragonFly__) || __FreeBSD_version >= 220000)
39 # include <sys/filio.h>
40 # include <sys/fcntl.h>
41 # if (defined(__DragonFly__) || __FreeBSD_version >= 300000) && !defined(IPFILTER_LKM)
42 # include "opt_ipfilter.h"
45 # include <sys/ioctl.h>
49 # include <sys/protosw.h>
51 #include <sys/socket.h>
52 #if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux)
53 # include <sys/systm.h>
55 #if !defined(__SVR4) && !defined(__svr4__)
57 # include <sys/mbuf.h>
60 # include <sys/filio.h>
61 # include <sys/byteorder.h>
63 # include <sys/dditypes.h>
65 # include <sys/stream.h>
66 # include <sys/kmem.h>
68 #if defined(__DragonFly__) && defined(_KERNEL)
69 # include <sys/thread2.h>
76 #include <net/route.h>
77 #include <netinet/in.h>
78 #include <netinet/in_systm.h>
79 #include <netinet/ip.h>
80 #include <netinet/tcp.h>
82 # include <netinet/ip_var.h>
83 # include <netinet/tcp_fsm.h>
85 #include <netinet/udp.h>
86 #include <netinet/ip_icmp.h>
87 #include "ip_compat.h"
88 #include <netinet/tcpip.h>
94 #include <netinet/icmp6.h>
96 #if defined(__DragonFly__) || (__FreeBSD_version >= 300000)
97 # include <sys/malloc.h>
98 # if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM)
99 # include <sys/libkern.h>
100 # include <sys/systm.h>
104 static const char sccsid
[] = "@(#)ip_state.c 1.8 6/5/96 (C) 1993-2000 Darren Reed";
107 # define MIN(a,b) (((a)<(b))?(a):(b))
110 #define TCP_CLOSE (TH_FIN|TH_RST)
112 static ipstate_t
**ips_table
= NULL
;
113 static int ips_num
= 0;
114 static int ips_wild
= 0;
115 static ips_stat_t ips_stats
;
116 #if (SOLARIS || defined(__sgi)) && defined(_KERNEL)
117 extern KRWLOCK_T ipf_state
, ipf_mutex
;
118 extern kmutex_t ipf_rw
;
122 static frentry_t
*fr_checkicmp6matchingstate (ip6_t
*, fr_info_t
*);
124 static int fr_matchsrcdst (ipstate_t
*, union i6addr
, union i6addr
,
125 fr_info_t
*, tcphdr_t
*);
126 static frentry_t
*fr_checkicmpmatchingstate (ip_t
*, fr_info_t
*);
127 static int fr_matchicmpqueryreply (int, ipstate_t
*, icmphdr_t
*, int);
128 static int fr_state_flush (int, int);
129 static ips_stat_t
*fr_statetstats (void);
130 static void fr_delstate (ipstate_t
*);
131 static int fr_state_remove (caddr_t
);
132 static void fr_ipsmove (ipstate_t
**, ipstate_t
*, u_int
);
133 static int fr_tcpoptions (tcphdr_t
*);
134 int fr_stputent (caddr_t
);
135 int fr_stgetent (caddr_t
);
136 void fr_stinsert (ipstate_t
*);
139 #define FIVE_DAYS (2 * 5 * 86400) /* 5 days: half closed session */
141 #define TCP_MSL 240 /* 2 minutes */
142 u_long fr_tcpidletimeout
= FIVE_DAYS
,
143 fr_tcpclosewait
= 2 * TCP_MSL
,
144 fr_tcplastack
= 2 * TCP_MSL
,
145 fr_tcptimeout
= 2 * TCP_MSL
,
147 fr_tcphalfclosed
= 2 * 2 * 3600, /* 2 hours */
149 fr_udpacktimeout
= 24,
150 fr_icmptimeout
= 120,
151 fr_icmpacktimeout
= 12;
152 int fr_statemax
= IPSTATE_MAX
,
153 fr_statesize
= IPSTATE_SIZE
;
154 int fr_state_doflush
= 0,
156 ipstate_t
*ips_list
= NULL
;
158 static int icmpreplytype4
[ICMP_MAXTYPE
+ 1];
160 static int icmpreplytype6
[ICMP6_MAXTYPE
+ 1];
167 KMALLOCS(ips_table
, ipstate_t
**, fr_statesize
* sizeof(ipstate_t
*));
168 if (ips_table
!= NULL
)
169 bzero((char *)ips_table
, fr_statesize
* sizeof(ipstate_t
*));
173 /* fill icmp reply type table */
174 for (i
= 0; i
<= ICMP_MAXTYPE
; i
++)
175 icmpreplytype4
[i
] = -1;
176 icmpreplytype4
[ICMP_ECHO
] = ICMP_ECHOREPLY
;
177 icmpreplytype4
[ICMP_TSTAMP
] = ICMP_TSTAMPREPLY
;
178 icmpreplytype4
[ICMP_IREQ
] = ICMP_IREQREPLY
;
179 icmpreplytype4
[ICMP_MASKREQ
] = ICMP_MASKREPLY
;
181 /* fill icmp reply type table */
182 for (i
= 0; i
<= ICMP6_MAXTYPE
; i
++)
183 icmpreplytype6
[i
] = -1;
184 icmpreplytype6
[ICMP6_ECHO_REQUEST
] = ICMP6_ECHO_REPLY
;
185 icmpreplytype6
[ICMP6_MEMBERSHIP_QUERY
] = ICMP6_MEMBERSHIP_REPORT
;
186 icmpreplytype6
[ICMP6_NI_QUERY
] = ICMP6_NI_REPLY
;
187 icmpreplytype6
[ND_ROUTER_SOLICIT
] = ND_ROUTER_ADVERT
;
188 icmpreplytype6
[ND_NEIGHBOR_SOLICIT
] = ND_NEIGHBOR_ADVERT
;
195 static ips_stat_t
*fr_statetstats()
197 ips_stats
.iss_active
= ips_num
;
198 ips_stats
.iss_table
= ips_table
;
199 ips_stats
.iss_list
= ips_list
;
205 * flush state tables. three actions currently defined:
206 * which == 0 : flush all state table entries
207 * which == 1 : flush TCP connections which have started to close but are
208 * stuck for some reason.
209 * which == 2 : flush TCP connections which have been idle for a long time,
210 * starting at > 4 days idle and working back in successive half-
211 * days to at most 12 hours old.
213 static int fr_state_flush(which
, proto
)
216 ipstate_t
*is
, **isp
;
217 #if defined(_KERNEL) && !SOLARIS && !defined(__DragonFly__)
220 int delete, removed
= 0, try;
223 for (isp
= &ips_list
; (is
= *isp
); ) {
226 if ((proto
!= 0) && (is
->is_v
!= proto
))
236 if (is
->is_p
!= IPPROTO_TCP
)
238 if ((is
->is_state
[0] != TCPS_ESTABLISHED
) ||
239 (is
->is_state
[1] != TCPS_ESTABLISHED
))
245 if (is
->is_p
== IPPROTO_TCP
)
248 ips_stats
.iss_expire
++;
250 ipstate_log(is
, ISL_FLUSH
);
259 * Asked to remove inactive entries, try again if first attempt
260 * failed. In this case, 86400 is half a day because the counter is
261 * activated every half second.
263 if ((which
== 2) && (removed
== 0)) {
264 try = 86400; /* half a day */
265 for (; (try < FIVE_DAYS
) && (removed
== 0); try += 86400) {
266 for (isp
= &ips_list
; (is
= *isp
); ) {
268 if ((is
->is_p
== IPPROTO_TCP
) &&
269 ((is
->is_state
[0] == TCPS_ESTABLISHED
) ||
270 (is
->is_state
[1] == TCPS_ESTABLISHED
)) &&
271 (is
->is_age
< try)) {
274 } else if ((is
->is_p
!= IPPROTO_TCP
) &&
276 ips_stats
.iss_expire
++;
281 ipstate_log(is
, ISL_FLUSH
);
296 static int fr_state_remove(data
)
303 error
= IRCOPYPTR(data
, (caddr_t
)&st
, sizeof(st
));
307 WRITE_ENTER(&ipf_state
);
308 for (sp
= ips_list
; sp
; sp
= sp
->is_next
)
309 if ((sp
->is_p
== st
.is_p
) && (sp
->is_v
== st
.is_v
) &&
310 !bcmp((char *)&sp
->is_src
, (char *)&st
.is_src
,
311 sizeof(st
.is_src
)) &&
312 !bcmp((char *)&sp
->is_dst
, (char *)&st
.is_dst
,
313 sizeof(st
.is_dst
)) &&
314 !bcmp((char *)&sp
->is_ps
, (char *)&st
.is_ps
,
317 ipstate_log(sp
, ISL_REMOVE
);
320 RWLOCK_EXIT(&ipf_state
);
323 RWLOCK_EXIT(&ipf_state
);
328 int fr_state_ioctl(data
, cmd
, mode
)
330 #if defined(__NetBSD__) || defined(__OpenBSD__)
337 int arg
, ret
, error
= 0;
342 error
= fr_state_remove(data
);
345 error
= IRCOPY(data
, (caddr_t
)&arg
, sizeof(arg
));
348 if (arg
== 0 || arg
== 1) {
349 WRITE_ENTER(&ipf_state
);
350 ret
= fr_state_flush(arg
, 4);
351 RWLOCK_EXIT(&ipf_state
);
352 error
= IWCOPY((caddr_t
)&ret
, data
, sizeof(ret
));
358 error
= IRCOPY(data
, (caddr_t
)&arg
, sizeof(arg
));
361 if (arg
== 0 || arg
== 1) {
362 WRITE_ENTER(&ipf_state
);
363 ret
= fr_state_flush(arg
, 6);
364 RWLOCK_EXIT(&ipf_state
);
365 error
= IWCOPY((caddr_t
)&ret
, data
, sizeof(ret
));
372 if (!(mode
& FWRITE
))
377 tmp
= ipflog_clear(IPL_LOGSTATE
);
378 IWCOPY((char *)&tmp
, data
, sizeof(tmp
));
383 error
= IWCOPYPTR((caddr_t
)fr_statetstats(), data
,
388 arg
= (int)iplused
[IPL_LOGSTATE
];
389 error
= IWCOPY((caddr_t
)&arg
, (caddr_t
)data
, sizeof(arg
));
393 error
= fr_lock(data
, &fr_state_lock
);
396 if (!fr_state_lock
) {
400 error
= fr_stputent(data
);
403 if (!fr_state_lock
) {
407 error
= fr_stgetent(data
);
418 * Copy out state information from the kernel to a user space process.
420 int fr_stgetent(data
)
427 error
= IRCOPYPTR(data
, (caddr_t
)&ips
, sizeof(ips
));
435 if (ips
.ips_next
== NULL
)
441 * Make sure the pointer we're copying from exists in the
442 * current list of entries. Security precaution to prevent
443 * copying of random kernel data.
445 for (is
= ips_list
; is
; is
= is
->is_next
)
451 ips
.ips_next
= isn
->is_next
;
452 bcopy((char *)isn
, (char *)&ips
.ips_is
, sizeof(ips
.ips_is
));
454 bcopy((char *)isn
->is_rule
, (char *)&ips
.ips_fr
,
456 error
= IWCOPYPTR((caddr_t
)&ips
, data
, sizeof(ips
));
463 int fr_stputent(data
)
472 error
= IRCOPYPTR(data
, (caddr_t
)&ips
, sizeof(ips
));
476 KMALLOC(isn
, ipstate_t
*);
480 bcopy((char *)&ips
.ips_is
, (char *)isn
, sizeof(*isn
));
483 if (isn
->is_flags
& FI_NEWFR
) {
484 KMALLOC(fr
, frentry_t
*);
489 bcopy((char *)&ips
.ips_fr
, (char *)fr
, sizeof(*fr
));
490 out
= fr
->fr_flags
& FR_OUTQUE
? 1 : 0;
492 ips
.ips_is
.is_rule
= fr
;
495 * Look up all the interface names in the rule.
497 for (i
= 0; i
< 4; i
++) {
498 name
= fr
->fr_ifnames
[i
];
499 if ((name
[1] == '\0') &&
500 ((name
[0] == '-') || (name
[0] == '*'))) {
501 fr
->fr_ifas
[i
] = NULL
;
502 } else if (*name
!= '\0') {
503 fr
->fr_ifas
[i
] = GETUNIT(name
,
505 if (fr
->fr_ifas
[i
] == NULL
)
506 fr
->fr_ifas
[i
] = (void *)-1;
508 strncpy(isn
->is_ifname
[i
],
509 IFNAME(fr
->fr_ifas
[i
]),
513 isn
->is_ifp
[out
] = fr
->fr_ifas
[i
];
517 * send a copy back to userland of what we ended up with
518 * to allow for verification.
520 error
= IWCOPYPTR((caddr_t
)&ips
, data
, sizeof(ips
));
527 for (is
= ips_list
; is
; is
= is
->is_next
)
528 if (is
->is_rule
== fr
)
542 * Insert a state table entry manually.
547 u_int hv
= is
->is_hv
;
551 MUTEX_INIT(&is
->is_lock
, "ipf state entry", NULL
);
554 * Look up all the interface names in the state entry.
556 for (i
= 0; i
< 4; i
++) {
557 name
= is
->is_ifname
[i
];
558 if ((name
[1] == '\0') &&
559 ((name
[0] == '-') || (name
[0] == '*'))) {
560 is
->is_ifp
[0] = NULL
;
561 } else if (*name
!= '\0') {
562 is
->is_ifp
[i
] = GETUNIT(name
, is
->is_v
);
563 if (is
->is_ifp
[i
] == NULL
)
564 is
->is_ifp
[i
] = (void *)-1;
570 * add into list table.
573 ips_list
->is_pnext
= &is
->is_next
;
574 is
->is_pnext
= &ips_list
;
575 is
->is_next
= ips_list
;
578 ips_table
[hv
]->is_phnext
= &is
->is_hnext
;
580 ips_stats
.iss_inuse
++;
581 is
->is_phnext
= ips_table
+ hv
;
582 is
->is_hnext
= ips_table
[hv
];
589 * Create a new ipstate structure and hang it off the hash table.
591 ipstate_t
*fr_addstate(ip
, fin
, stsave
, flags
)
597 tcphdr_t
*tcp
= NULL
;
606 if (fr_state_lock
|| (fin
->fin_off
!= 0) || (fin
->fin_fl
& FI_SHORT
) ||
607 (fin
->fin_misc
& FM_BADSTATE
))
609 if (ips_num
== fr_statemax
) {
611 fr_state_doflush
= 1;
616 bzero((char *)is
, sizeof(*is
));
618 ips
.is_state
[0] = TCPS_CLOSED
;
619 ips
.is_state
[1] = TCPS_CLOSED
;
622 * Copy and calculate...
624 hv
= (is
->is_p
= fin
->fin_fi
.fi_p
);
625 is
->is_src
= fin
->fin_fi
.fi_src
;
627 is
->is_dst
= fin
->fin_fi
.fi_dst
;
630 if (fin
->fin_v
== 6) {
631 if ((is
->is_p
== IPPROTO_ICMPV6
) &&
632 IN6_IS_ADDR_MULTICAST(&is
->is_dst
.in6
)) {
634 * So you can do keep state with neighbour discovery.
639 hv
+= is
->is_dst
.i6
[1];
640 hv
+= is
->is_dst
.i6
[2];
641 hv
+= is
->is_dst
.i6
[3];
643 hv
+= is
->is_src
.i6
[1];
644 hv
+= is
->is_src
.i6
[2];
645 hv
+= is
->is_src
.i6
[3];
654 case IPPROTO_ICMPV6
:
655 ic
= (struct icmp
*)fin
->fin_dp
;
656 if ((ic
->icmp_type
& ICMP6_INFOMSG_MASK
) == 0)
659 switch (ic
->icmp_type
)
661 case ICMP6_ECHO_REQUEST
:
662 is
->is_icmp
.ics_type
= ic
->icmp_type
;
663 hv
+= (is
->is_icmp
.ics_id
= ic
->icmp_id
);
664 hv
+= (is
->is_icmp
.ics_seq
= ic
->icmp_seq
);
666 case ICMP6_MEMBERSHIP_QUERY
:
667 case ND_ROUTER_SOLICIT
:
668 case ND_NEIGHBOR_SOLICIT
:
669 case ICMP6_NI_QUERY
:
670 is
->is_icmp
.ics_type
= ic
->icmp_type
;
675 ATOMIC_INCL(ips_stats
.iss_icmp
);
676 is
->is_age
= fr_icmptimeout
;
680 ic
= (struct icmp
*)fin
->fin_dp
;
682 switch (ic
->icmp_type
)
688 is
->is_icmp
.ics_type
= ic
->icmp_type
;
689 hv
+= (is
->is_icmp
.ics_id
= ic
->icmp_id
);
690 hv
+= (is
->is_icmp
.ics_seq
= ic
->icmp_seq
);
695 ATOMIC_INCL(ips_stats
.iss_icmp
);
696 is
->is_age
= fr_icmptimeout
;
699 tcp
= (tcphdr_t
*)fin
->fin_dp
;
701 if (tcp
->th_flags
& TH_RST
)
704 * The endian of the ports doesn't matter, but the ack and
705 * sequence numbers do as we do mathematics on them later.
707 is
->is_sport
= htons(fin
->fin_data
[0]);
708 is
->is_dport
= htons(fin
->fin_data
[1]);
709 if ((flags
& (FI_W_DPORT
|FI_W_SPORT
)) == 0) {
713 if ((flags
& FI_IGNOREPKT
) == 0) {
714 is
->is_send
= ntohl(tcp
->th_seq
) + fin
->fin_dlen
-
715 (off
= (tcp
->th_off
<< 2)) +
716 ((tcp
->th_flags
& TH_SYN
) ? 1 : 0) +
717 ((tcp
->th_flags
& TH_FIN
) ? 1 : 0);
718 is
->is_maxsend
= is
->is_send
;
720 if ((tcp
->th_flags
& TH_SYN
) &&
721 ((tcp
->th_off
<< 2) >= (sizeof(*tcp
) + 4))) {
722 ws
= fr_tcpoptions(tcp
);
729 is
->is_maxswin
= ntohs(tcp
->th_win
);
730 if (is
->is_maxswin
== 0)
733 if ((tcp
->th_flags
& TH_OPENING
) == TH_SYN
)
737 * If we're creating state for a starting connection, start the
738 * timer on it as we'll never see an error if it fails to
741 ATOMIC_INCL(ips_stats
.iss_tcp
);
745 tcp
= (tcphdr_t
*)fin
->fin_dp
;
747 is
->is_sport
= htons(fin
->fin_data
[0]);
748 is
->is_dport
= htons(fin
->fin_data
[1]);
749 if ((flags
& (FI_W_DPORT
|FI_W_SPORT
)) == 0) {
753 ATOMIC_INCL(ips_stats
.iss_udp
);
754 is
->is_age
= fr_udptimeout
;
757 is
->is_age
= fr_udptimeout
;
761 KMALLOC(is
, ipstate_t
*);
763 ATOMIC_INCL(ips_stats
.iss_nomem
);
766 bcopy((char *)&ips
, (char *)is
, sizeof(*is
));
769 is
->is_rule
= fin
->fin_fr
;
770 if (is
->is_rule
!= NULL
) {
771 is
->is_group
= is
->is_rule
->fr_group
;
772 ATOMIC_INC32(is
->is_rule
->fr_ref
);
773 pass
= is
->is_rule
->fr_flags
;
774 is
->is_frage
[0] = is
->is_rule
->fr_age
[0];
775 is
->is_frage
[1] = is
->is_rule
->fr_age
[1];
776 if (is
->is_frage
[0] != 0)
777 is
->is_age
= is
->is_frage
[0];
779 is
->is_ifp
[(out
<< 1) + 1] = is
->is_rule
->fr_ifas
[1];
780 is
->is_ifp
[(1 - out
) << 1] = is
->is_rule
->fr_ifas
[2];
781 is
->is_ifp
[((1 - out
) << 1) + 1] = is
->is_rule
->fr_ifas
[3];
783 if (((ifp
= is
->is_rule
->fr_ifas
[1]) != NULL
) &&
785 strncpy(is
->is_ifname
[(out
<< 1) + 1],
786 IFNAME(ifp
), IFNAMSIZ
);
787 if (((ifp
= is
->is_rule
->fr_ifas
[2]) != NULL
) &&
789 strncpy(is
->is_ifname
[(1 - out
) << 1],
790 IFNAME(ifp
), IFNAMSIZ
);
791 if (((ifp
= is
->is_rule
->fr_ifas
[3]) != NULL
) &&
793 strncpy(is
->is_ifname
[((1 - out
) << 1) + 1],
794 IFNAME(ifp
), IFNAMSIZ
);
798 is
->is_ifp
[out
<< 1] = fin
->fin_ifp
;
799 strncpy(is
->is_ifname
[out
<< 1], IFNAME(fin
->fin_ifp
), IFNAMSIZ
);
801 WRITE_ENTER(&ipf_state
);
804 if ((flags
& FI_IGNOREPKT
) == 0) {
806 is
->is_bytes
= fin
->fin_dlen
+ fin
->fin_hlen
;
809 * We want to check everything that is a property of this packet,
810 * but we don't (automatically) care about its fragment status as
813 is
->is_v
= fin
->fin_v
;
814 is
->is_rulen
= fin
->fin_rule
;
815 is
->is_opt
= fin
->fin_fi
.fi_optmsk
;
816 is
->is_optmsk
= 0xffffffff;
817 is
->is_sec
= fin
->fin_fi
.fi_secmsk
;
818 is
->is_secmsk
= 0xffff;
819 is
->is_auth
= fin
->fin_fi
.fi_auth
;
820 is
->is_authmsk
= 0xffff;
821 is
->is_flags
= fin
->fin_fl
& FI_CMP
;
822 is
->is_flags
|= FI_CMP
<< 4;
823 is
->is_flags
|= flags
& (FI_WILDP
|FI_WILDA
);
824 if (flags
& (FI_WILDP
|FI_WILDA
))
827 if (pass
& FR_LOGFIRST
)
828 is
->is_pass
&= ~(FR_LOGFIRST
|FR_LOG
);
831 if (is
->is_p
== IPPROTO_TCP
) {
832 fr_tcp_age(&is
->is_age
, is
->is_state
, fin
,
833 0, is
->is_fsm
); /* 0 = packet from the source */
836 ipstate_log(is
, ISL_NEW
);
838 RWLOCK_EXIT(&ipf_state
);
839 fin
->fin_rev
= IP6NEQ(is
->is_dst
, fin
->fin_fi
.fi_dst
);
840 if ((fin
->fin_fl
& FI_FRAG
) && (pass
& FR_KEEPFRAG
))
841 ipfr_newfrag(ip
, fin
);
846 static int fr_tcpoptions(tcp
)
852 opt
= (u_char
*) (tcp
+ 1);
853 last
= ((u_char
*)tcp
) + (tcp
->th_off
<< 2);
855 /* If we don't find wscale here, we need to clear it */
858 /* Termination condition picked such that opt[0 .. 2] exist */
859 while ((opt
< last
- 2) && (*opt
!= TCPOPT_EOL
)) {
865 /* Proper length ? */
874 /* Unknown options must be two bytes+ */
888 * check to see if a packet with TCP headers fits within the TCP window.
889 * change timeout depending on whether new packet is a SYN-ACK returning for a
890 * SYN or a RST or FIN which indicate time to close up shop.
892 int fr_tcpstate(is
, fin
, ip
, tcp
)
898 tcp_seq seq
, ack
, end
;
900 tcpdata_t
*fdata
, *tdata
;
907 * Find difference between last checked packet and this packet.
909 source
= IP6EQ(fin
->fin_fi
.fi_src
, is
->is_src
);
910 if (source
&& (ntohs(is
->is_sport
) != fin
->fin_data
[0]))
912 fdata
= &is
->is_tcp
.ts_data
[!source
];
913 tdata
= &is
->is_tcp
.ts_data
[source
];
914 off
= tcp
->th_off
<< 2;
915 seq
= ntohl(tcp
->th_seq
);
916 ack
= ntohl(tcp
->th_ack
);
917 win
= ntohs(tcp
->th_win
);
918 end
= seq
+ fin
->fin_dlen
- off
+
919 ((tcp
->th_flags
& TH_SYN
) ? 1 : 0) +
920 ((tcp
->th_flags
& TH_FIN
) ? 1 : 0);
923 if ((tcp
->th_flags
& TH_SYN
) && (off
>= sizeof(*tcp
) + 4))
924 wscale
= fr_tcpoptions(tcp
);
928 MUTEX_ENTER(&is
->is_lock
);
931 fdata
->td_wscale
= wscale
;
932 else if (wscale
== -2)
933 fdata
->td_wscale
= tdata
->td_wscale
= 0;
934 if (!(tcp
->th_flags
& TH_SYN
))
935 win
<<= fdata
->td_wscale
;
937 if ((fdata
->td_end
== 0) &&
938 (!is
->is_fsm
|| ((tcp
->th_flags
& TH_OPENING
) == TH_OPENING
))) {
940 * Must be a (outgoing) SYN-ACK in reply to a SYN.
943 fdata
->td_maxwin
= 1;
944 fdata
->td_maxend
= end
+ win
;
949 if (!(tcp
->th_flags
& TH_ACK
)) { /* Pretend an ack was sent */
951 } else if (((tcp
->th_flags
& (TH_ACK
|TH_RST
)) == (TH_ACK
|TH_RST
)) &&
953 /* gross hack to get around certain broken tcp stacks */
958 seq
= end
= fdata
->td_end
;
960 maxwin
= tdata
->td_maxwin
;
961 ackskew
= tdata
->td_end
- ack
;
963 #define SEQ_GE(a,b) ((int)((a) - (b)) >= 0)
964 #define SEQ_GT(a,b) ((int)((a) - (b)) > 0)
965 if ((SEQ_GE(fdata
->td_maxend
, end
)) &&
966 (SEQ_GE(seq
, fdata
->td_end
- maxwin
)) &&
967 /* XXX what about big packets */
968 #define MAXACKWINDOW 66000
969 (-ackskew
<= (MAXACKWINDOW
<< tdata
->td_wscale
)) &&
970 ( ackskew
<= (MAXACKWINDOW
<< tdata
->td_wscale
))) {
972 /* if ackskew < 0 then this should be due to fragmented
973 * packets. There is no way to know the length of the
974 * total packet in advance.
975 * We do know the total length from the fragment cache though.
976 * Note however that there might be more sessions with
977 * exactly the same source and destination parameters in the
978 * state cache (and source and destination is the only stuff
979 * that is saved in the fragment cache). Note further that
980 * some TCP connections in the state cache are hashed with
981 * sport and dport as well which makes it not worthwhile to
983 * Thus, when ackskew is negative but still seems to belong
984 * to this session, we bump up the destinations end value.
987 * Nearing end of connection, start timeout.
989 /* source ? 0 : 1 -> !source */
990 if (fr_tcp_age(&is
->is_age
, is
->is_state
, fin
, !source
,
991 (int)is
->is_fsm
) == 0) {
995 /* update max window seen */
996 if (fdata
->td_maxwin
< win
)
997 fdata
->td_maxwin
= win
;
998 if (SEQ_GT(end
, fdata
->td_end
))
1000 if (SEQ_GE(ack
+ win
, tdata
->td_maxend
)) {
1001 tdata
->td_maxend
= ack
+ win
;
1006 ATOMIC_INCL(ips_stats
.iss_hits
);
1010 MUTEX_EXIT(&is
->is_lock
);
1011 if ((ret
== 0) && ((tcp
->th_flags
& TH_OPENING
) != TH_SYN
))
1012 fin
->fin_misc
|= FM_BADSTATE
;
1018 * Match a state table entry against an IP packet.
1020 static int fr_matchsrcdst(is
, src
, dst
, fin
, tcp
)
1022 union i6addr src
, dst
;
1026 int ret
= 0, rev
, out
, flags
, idx
;
1030 rev
= IP6NEQ(is
->is_dst
, dst
);
1033 flags
= is
->is_flags
& (FI_WILDA
|FI_WILDP
);
1038 flags
= is
->is_flags
;
1042 if (!(flags
& FI_W_SPORT
) && (sp
!= is
->is_sport
))
1044 else if (!(flags
& FI_W_DPORT
) && (dp
!= is
->is_dport
))
1049 idx
= (out
<< 1) + rev
;
1051 if ((is
->is_ifp
[idx
] == NULL
&&
1052 (*is
->is_ifname
[idx
] == '\0' || *is
->is_ifname
[idx
] == '*')) ||
1053 is
->is_ifp
[idx
] == ifp
)
1061 if ((IP6EQ(is
->is_dst
, dst
) || (flags
& FI_W_DADDR
)) &&
1062 (IP6EQ(is
->is_src
, src
) || (flags
& FI_W_SADDR
)) &&
1063 (!tcp
|| ((sp
== is
->is_sport
|| flags
& FI_W_SPORT
) &&
1064 (dp
== is
->is_dport
|| flags
& FI_W_DPORT
)))) {
1068 if ((IP6EQ(is
->is_dst
, src
) || (flags
& FI_W_DADDR
)) &&
1069 (IP6EQ(is
->is_src
, dst
) || (flags
& FI_W_SADDR
)) &&
1070 (!tcp
|| ((sp
== is
->is_dport
|| flags
& FI_W_DPORT
) &&
1071 (dp
== is
->is_sport
|| flags
& FI_W_SPORT
)))) {
1079 * Whether or not this should be here, is questionable, but the aim
1080 * is to get this out of the main line.
1083 flags
= is
->is_flags
& (FI_CMP
|(FI_CMP
<<4));
1085 if (((fin
->fin_fl
& (flags
>> 4)) != (flags
& FI_CMP
)) ||
1086 (fin
->fin_fi
.fi_optmsk
!= is
->is_opt
) ||
1087 (fin
->fin_fi
.fi_secmsk
!= is
->is_sec
) ||
1088 (fin
->fin_fi
.fi_auth
!= is
->is_auth
))
1091 flags
= is
->is_flags
& (FI_WILDA
|FI_WILDP
);
1092 if ((flags
& (FI_W_SADDR
|FI_W_DADDR
))) {
1093 if ((flags
& FI_W_SADDR
) != 0) {
1095 is
->is_src
= fin
->fin_fi
.fi_src
;
1097 is
->is_src
= fin
->fin_fi
.fi_dst
;
1099 } else if ((flags
& FI_W_DADDR
) != 0) {
1101 is
->is_dst
= fin
->fin_fi
.fi_dst
;
1103 is
->is_dst
= fin
->fin_fi
.fi_src
;
1106 is
->is_flags
&= ~(FI_W_SADDR
|FI_W_DADDR
);
1107 if ((is
->is_flags
& (FI_WILDA
|FI_WILDP
)) == 0)
1111 if ((flags
& (FI_W_SPORT
|FI_W_DPORT
))) {
1112 if ((flags
& FI_W_SPORT
) != 0) {
1115 is
->is_send
= htonl(tcp
->th_seq
);
1118 is
->is_send
= htonl(tcp
->th_ack
);
1120 is
->is_maxsend
= is
->is_send
+ 1;
1121 } else if ((flags
& FI_W_DPORT
) != 0) {
1124 is
->is_dend
= htonl(tcp
->th_ack
);
1127 is
->is_dend
= htonl(tcp
->th_seq
);
1129 is
->is_maxdend
= is
->is_dend
+ 1;
1131 is
->is_flags
&= ~(FI_W_SPORT
|FI_W_DPORT
);
1137 if (is
->is_ifp
[idx
] == NULL
&&
1138 (*is
->is_ifname
[idx
] == '\0' || *is
->is_ifname
[idx
] == '*'))
1142 is
->is_ifp
[ret
] = ifp
;
1143 strncpy(is
->is_ifname
[ret
], IFNAME(ifp
),
1144 sizeof(is
->is_ifname
[ret
]));
1151 fr_matchicmpqueryreply(int v
, ipstate_t
*is
, icmphdr_t
*icmp
, int rev
)
1155 * If we matched its type on the way in, then when going out
1156 * it will still be the same type.
1158 if ((!rev
&& (icmp
->icmp_type
== is
->is_type
)) ||
1159 (rev
&& (icmpreplytype4
[is
->is_type
] == icmp
->icmp_type
))) {
1160 if (icmp
->icmp_type
!= ICMP_ECHOREPLY
)
1162 if ((icmp
->icmp_id
== is
->is_icmp
.ics_id
) &&
1163 (icmp
->icmp_seq
== is
->is_icmp
.ics_seq
))
1168 else if (is
->is_v
== 6) {
1169 if ((!rev
&& (icmp
->icmp_type
== is
->is_type
)) ||
1170 (rev
&& (icmpreplytype6
[is
->is_type
] == icmp
->icmp_type
))) {
1171 if (icmp
->icmp_type
!= ICMP6_ECHO_REPLY
)
1173 if ((icmp
->icmp_id
== is
->is_icmp
.ics_id
) &&
1174 (icmp
->icmp_seq
== is
->is_icmp
.ics_seq
))
1182 static frentry_t
*fr_checkicmpmatchingstate(ip
, fin
)
1186 ipstate_t
*is
, **isp
;
1187 u_short sport
, dport
;
1189 u_short savelen
, ohlen
;
1190 union i6addr dst
, src
;
1201 * Does it at least have the return (basic) IP header ?
1202 * Only a basic IP header (no options) should be with
1203 * an ICMP error header.
1205 if (((ip
->ip_v
!= 4) || (ip
->ip_hl
!= 5)) ||
1206 (fin
->fin_plen
< ICMPERR_MINPKTLEN
))
1209 ic
= (struct icmp
*)fin
->fin_dp
;
1210 type
= ic
->icmp_type
;
1212 * If it's not an error type, then return
1214 if ((type
!= ICMP_UNREACH
) && (type
!= ICMP_SOURCEQUENCH
) &&
1215 (type
!= ICMP_REDIRECT
) && (type
!= ICMP_TIMXCEED
) &&
1216 (type
!= ICMP_PARAMPROB
))
1219 oip
= (ip_t
*)((char *)ic
+ ICMPERR_ICMPHLEN
);
1220 ohlen
= oip
->ip_hl
<< 2;
1222 * Check if at least the old IP header (with options) and
1223 * 8 bytes of payload are present.
1225 if (fin
->fin_plen
< ICMPERR_MAXPKTLEN
+ ohlen
- sizeof(*oip
))
1231 len
= fin
->fin_dlen
- ICMPERR_ICMPHLEN
;
1232 if ((len
<= 0) || (ohlen
> len
))
1236 * Is the buffer big enough for all of it ? It's the size of the IP
1237 * header claimed in the encapsulated part which is of concern. It
1238 * may be too big to be in this buffer but not so big that it's
1239 * outside the ICMP packet, leading to TCP deref's causing problems.
1240 * This is possible because we don't know how big oip_hl is when we
1241 * do the pullup early in fr_check() and thus can't guarantee it is
1250 if ((char *)oip
+ len
> (char *)m
->b_wptr
)
1253 m
= *(mb_t
**)fin
->fin_mp
;
1254 if ((char *)oip
+ len
> (char *)ip
+ m
->m_len
)
1261 * in the IPv4 case we must zero the i6addr union otherwise
1262 * the IP6EQ and IP6NEQ macros produce the wrong results because
1263 * of the 'junk' in the unused part of the union
1265 bzero((char *)&src
, sizeof(src
));
1266 bzero((char *)&dst
, sizeof(dst
));
1267 bzero((char *)&ofin
, sizeof(ofin
));
1269 * We make an fin entry to be able to feed it to
1270 * matchsrcdst. Note that not all fields are necessary
1271 * but this is the cleanest way. Note further that we
1272 * fill in fin_mp such that if someone uses it we'll get
1273 * a kernel panic. fr_matchsrcdst does not use this.
1275 ofin
.fin_ifp
= fin
->fin_ifp
;
1276 ofin
.fin_out
= !fin
->fin_out
;
1280 * watch out here, as ip is in host order and oip in network
1281 * order. Any change we make must be undone afterwards, like
1282 * oip->ip_off - it is still in network byte order so fix it.
1284 savelen
= oip
->ip_len
;
1286 oip
->ip_off
= ntohs(oip
->ip_off
);
1287 (void) fr_makefrip(ohlen
, oip
, &ofin
);
1289 * Reset the short flag here because in fr_matchsrcdst() the flags
1290 * for the current packet (fin_fl) are compared against * those for
1291 * the existing session.
1293 ofin
.fin_fl
&= ~FI_SHORT
;
1296 * Put old values of ip_len and ip_off back as we don't know
1297 * if we have to forward the packet (or process it again).
1299 oip
->ip_len
= savelen
;
1300 oip
->ip_off
= htons(oip
->ip_off
);
1303 ofin
.fin_qfm
= NULL
;
1310 icmp
= (icmphdr_t
*)((char *)oip
+ ohlen
);
1313 * an ICMP error can only be generated as a result of an
1314 * ICMP query, not as the response on an ICMP error
1316 * XXX theoretically ICMP_ECHOREP and the other replies are
1317 * ICMP queries as well, but adding them here seems strange XXX
1319 if ((icmp
->icmp_type
!= ICMP_ECHO
) &&
1320 (icmp
->icmp_type
!= ICMP_TSTAMP
) &&
1321 (icmp
->icmp_type
!= ICMP_IREQ
) &&
1322 (icmp
->icmp_type
!= ICMP_MASKREQ
))
1326 * perform a lookup of the ICMP packet in the state table
1328 hv
= (pr
= oip
->ip_p
);
1329 src
.in4
= oip
->ip_src
;
1330 hv
+= src
.in4
.s_addr
;
1331 dst
.in4
= oip
->ip_dst
;
1332 hv
+= dst
.in4
.s_addr
;
1333 hv
+= icmp
->icmp_id
;
1334 hv
+= icmp
->icmp_seq
;
1337 READ_ENTER(&ipf_state
);
1338 for (isp
= &ips_table
[hv
]; (is
= *isp
); isp
= &is
->is_hnext
)
1339 if ((is
->is_p
== pr
) && (is
->is_v
== 4) &&
1340 (is
->is_icmppkts
< is
->is_pkts
) &&
1341 fr_matchsrcdst(is
, src
, dst
, &ofin
, NULL
) &&
1342 fr_matchicmpqueryreply(is
->is_v
, is
, icmp
,
1344 ips_stats
.iss_hits
++;
1346 is
->is_bytes
+= ip
->ip_len
;
1350 RWLOCK_EXIT(&ipf_state
);
1355 if (fin
->fin_plen
< ICMPERR_MAXPKTLEN
)
1362 tcp
= (tcphdr_t
*)((char *)oip
+ ohlen
);
1363 dport
= tcp
->th_dport
;
1364 sport
= tcp
->th_sport
;
1366 hv
= (pr
= oip
->ip_p
);
1367 src
.in4
= oip
->ip_src
;
1368 hv
+= src
.in4
.s_addr
;
1369 dst
.in4
= oip
->ip_dst
;
1370 hv
+= dst
.in4
.s_addr
;
1375 READ_ENTER(&ipf_state
);
1376 for (isp
= &ips_table
[hv
]; (is
= *isp
); isp
= &is
->is_hnext
) {
1378 * Only allow this icmp through if the
1379 * encapsulated packet was allowed through the
1380 * other way around. Note that the minimal amount
1381 * of info present does not allow for checking against
1382 * tcp internals such as seq and ack numbers. Only the
1383 * ports are known to be present and can be even if the
1384 * short flag is set.
1386 if ((is
->is_p
== pr
) && (is
->is_v
== 4) &&
1387 (is
->is_icmppkts
< is
->is_pkts
) &&
1388 fr_matchsrcdst(is
, src
, dst
, &ofin
, tcp
)) {
1390 ips_stats
.iss_hits
++;
1392 is
->is_bytes
+= fin
->fin_plen
;
1394 * we deliberately do not touch the timeouts
1395 * for the accompanying state table entry.
1396 * It remains to be seen if that is correct. XXX
1401 RWLOCK_EXIT(&ipf_state
);
1407 * Move a state hash table entry from its old location at is->is_hv to
1408 * its new location, indexed by hv % fr_statesize.
1410 static void fr_ipsmove(isp
, is
, hv
)
1411 ipstate_t
**isp
, *is
;
1418 * Remove the hash from the old location...
1421 is
->is_hnext
->is_phnext
= isp
;
1422 *isp
= is
->is_hnext
;
1423 if (ips_table
[hvm
] == NULL
)
1424 ips_stats
.iss_inuse
--;
1427 * ...and put the hash in the new one.
1429 hvm
= hv
% fr_statesize
;
1431 isp
= &ips_table
[hvm
];
1433 (*isp
)->is_phnext
= &is
->is_hnext
;
1435 ips_stats
.iss_inuse
++;
1436 is
->is_phnext
= isp
;
1437 is
->is_hnext
= *isp
;
1443 * Check if a packet has a registered state.
1445 frentry_t
*fr_checkstate(ip
, fin
)
1449 union i6addr dst
, src
;
1450 ipstate_t
*is
, **isp
;
1452 u_int hv
, hvm
, hlen
, tryagain
, pass
, v
;
1458 if ((ips_list
== NULL
) || (fin
->fin_off
!= 0) || fr_state_lock
||
1459 (fin
->fin_fl
& FI_SHORT
))
1463 hlen
= fin
->fin_hlen
;
1464 tcp
= (tcphdr_t
*)((char *)ip
+ hlen
);
1465 ic
= (struct icmp
*)tcp
;
1466 hv
= (pr
= fin
->fin_fi
.fi_p
);
1467 src
= fin
->fin_fi
.fi_src
;
1468 dst
= fin
->fin_fi
.fi_dst
;
1469 hv
+= src
.in4
.s_addr
;
1470 hv
+= dst
.in4
.s_addr
;
1473 * Search the hash table for matching packet header info.
1474 * At the bottom of this switch statement, the following is expected:
1475 * is == NULL, no lock on ipf_state is held.
1476 * is != NULL, a lock on ipf_state is held.
1478 v
= fin
->fin_fi
.fi_v
;
1481 hv
+= fin
->fin_fi
.fi_src
.i6
[1];
1482 hv
+= fin
->fin_fi
.fi_src
.i6
[2];
1483 hv
+= fin
->fin_fi
.fi_src
.i6
[3];
1485 if ((fin
->fin_p
== IPPROTO_ICMPV6
) &&
1486 IN6_IS_ADDR_MULTICAST(&fin
->fin_fi
.fi_dst
.in6
)) {
1487 hv
-= dst
.in4
.s_addr
;
1489 hv
+= fin
->fin_fi
.fi_dst
.i6
[1];
1490 hv
+= fin
->fin_fi
.fi_dst
.i6
[2];
1491 hv
+= fin
->fin_fi
.fi_dst
.i6
[3];
1499 case IPPROTO_ICMPV6
:
1503 if ((ic
->icmp_type
== ICMP6_ECHO_REQUEST
) ||
1504 (ic
->icmp_type
== ICMP6_ECHO_REPLY
)) {
1509 READ_ENTER(&ipf_state
);
1511 hvm
= hv
% fr_statesize
;
1512 for (isp
= &ips_table
[hvm
]; (is
= *isp
); isp
= &is
->is_hnext
)
1513 if ((is
->is_p
== pr
) && (is
->is_v
== v
) &&
1514 fr_matchsrcdst(is
, src
, dst
, fin
, NULL
) &&
1515 fr_matchicmpqueryreply(v
, is
, ic
, fin
->fin_rev
)) {
1517 if (is
->is_frage
[rev
] != 0)
1518 is
->is_age
= is
->is_frage
[rev
];
1520 is
->is_age
= fr_icmpacktimeout
;
1522 is
->is_age
= fr_icmptimeout
;
1527 if (tryagain
&& !(is
->is_flags
& FI_W_DADDR
)) {
1528 hv
+= fin
->fin_fi
.fi_src
.i6
[0];
1529 hv
+= fin
->fin_fi
.fi_src
.i6
[1];
1530 hv
+= fin
->fin_fi
.fi_src
.i6
[2];
1531 hv
+= fin
->fin_fi
.fi_src
.i6
[3];
1532 fr_ipsmove(isp
, is
, hv
);
1533 MUTEX_DOWNGRADE(&ipf_state
);
1537 RWLOCK_EXIT(&ipf_state
);
1540 * No matching icmp state entry. Perhaps this is a
1541 * response to another state entry.
1543 if ((ips_wild
!= 0) && (v
== 6) && (tryagain
== 0) &&
1544 !IN6_IS_ADDR_MULTICAST(&fin
->fin_fi
.fi_src
.in6
)) {
1545 hv
-= fin
->fin_fi
.fi_src
.i6
[0];
1546 hv
-= fin
->fin_fi
.fi_src
.i6
[1];
1547 hv
-= fin
->fin_fi
.fi_src
.i6
[2];
1548 hv
-= fin
->fin_fi
.fi_src
.i6
[3];
1550 WRITE_ENTER(&ipf_state
);
1554 fr
= fr_checkicmp6matchingstate((ip6_t
*)ip
, fin
);
1565 hvm
= hv
% fr_statesize
;
1566 READ_ENTER(&ipf_state
);
1567 for (isp
= &ips_table
[hvm
]; (is
= *isp
); isp
= &is
->is_hnext
)
1568 if ((is
->is_p
== pr
) && (is
->is_v
== v
) &&
1569 fr_matchsrcdst(is
, src
, dst
, fin
, NULL
) &&
1570 fr_matchicmpqueryreply(v
, is
, ic
, fin
->fin_rev
)) {
1572 if (is
->is_frage
[rev
] != 0)
1573 is
->is_age
= is
->is_frage
[rev
];
1574 else if (fin
->fin_rev
)
1575 is
->is_age
= fr_icmpacktimeout
;
1577 is
->is_age
= fr_icmptimeout
;
1583 RWLOCK_EXIT(&ipf_state
);
1585 * No matching icmp state entry. Perhaps this is a
1586 * response to another state entry.
1588 fr
= fr_checkicmpmatchingstate(ip
, fin
);
1594 * Just plain ignore RST flag set with either FIN or SYN.
1596 if ((tcp
->th_flags
& TH_RST
) &&
1597 ((tcp
->th_flags
& (TH_FIN
|TH_SYN
|TH_RST
)) != TH_RST
))
1601 u_short dport
, sport
;
1603 dport
= tcp
->th_dport
;
1604 sport
= tcp
->th_sport
;
1608 READ_ENTER(&ipf_state
);
1610 hvm
= hv
% fr_statesize
;
1611 for (isp
= &ips_table
[hvm
]; (is
= *isp
); isp
= &is
->is_hnext
)
1612 if ((is
->is_p
== pr
) && (is
->is_v
== v
) &&
1613 fr_matchsrcdst(is
, src
, dst
, fin
, tcp
)) {
1615 if ((pr
== IPPROTO_TCP
)) {
1616 if (!fr_tcpstate(is
, fin
, ip
, tcp
))
1618 } else if ((pr
== IPPROTO_UDP
)) {
1619 if (is
->is_frage
[rev
] != 0)
1620 is
->is_age
= is
->is_frage
[rev
];
1621 else if (fin
->fin_rev
)
1622 is
->is_age
= fr_udpacktimeout
;
1624 is
->is_age
= fr_udptimeout
;
1630 !(is
->is_flags
& (FI_WILDP
|FI_WILDA
))) {
1633 fr_ipsmove(isp
, is
, hv
);
1634 MUTEX_DOWNGRADE(&ipf_state
);
1639 RWLOCK_EXIT(&ipf_state
);
1640 if (!tryagain
&& ips_wild
) {
1644 WRITE_ENTER(&ipf_state
);
1652 READ_ENTER(&ipf_state
);
1653 for (isp
= &ips_table
[hv
]; (is
= *isp
); isp
= &is
->is_hnext
) {
1654 if ((is
->is_p
== pr
) && (is
->is_v
== v
) &&
1655 fr_matchsrcdst(is
, src
, dst
, fin
, NULL
)) {
1657 if (is
->is_frage
[rev
] != 0)
1658 is
->is_age
= is
->is_frage
[rev
];
1660 is
->is_age
= fr_udptimeout
;
1665 RWLOCK_EXIT(&ipf_state
);
1671 ATOMIC_INCL(ips_stats
.iss_miss
);
1675 MUTEX_ENTER(&is
->is_lock
);
1676 is
->is_bytes
+= fin
->fin_plen
;
1677 ips_stats
.iss_hits
++;
1679 MUTEX_EXIT(&is
->is_lock
);
1681 fin
->fin_rule
= is
->is_rulen
;
1683 fin
->fin_group
= fr
->fr_group
;
1684 fin
->fin_icode
= fr
->fr_icode
;
1688 RWLOCK_EXIT(&ipf_state
);
1689 if ((fin
->fin_fl
& FI_FRAG
) && (pass
& FR_KEEPFRAG
))
1690 ipfr_newfrag(ip
, fin
);
1692 if ((tcp
!= NULL
) && (tcp
->th_flags
& TCP_CLOSE
))
1700 * Sync. state entries. If interfaces come or go or just change position,
/*
 * ip_statesync(ifp)
 *
 * Walks every state entry on ips_list while holding the ipf_state lock
 * for writing and refreshes the cached interface pointers of each entry.
 * Only slots of is_ifp[] that currently equal ifp are touched.
 *
 * NOTE(review): the declaration of ifp and of the locals is/i is not
 * visible in this excerpt.
 */
1703 void ip_statesync(ifp
)
/* Exclusive access: entries are modified in place below. */
1709 WRITE_ENTER(&ipf_state
);
1710 for (is
= ips_list
; is
; is
= is
->is_next
) {
/* Each state entry carries four interface slots. */
1711 for (i
= 0; i
< 4; i
++) {
1712 if (is
->is_ifp
[i
] == ifp
) {
/* Look the interface up again from the name saved in the entry. */
1713 is
->is_ifp
[i
] = GETUNIT(is
->is_ifname
[i
],
/*
 * The slot is set to the sentinel (void *)-1 here.
 * NOTE(review): the condition guarding this store is not visible in
 * this excerpt; presumably it applies when the lookup fails - confirm.
 */
1716 is
->is_ifp
[i
] = (void *)-1;
1720 RWLOCK_EXIT(&ipf_state
);
1725 * Must always be called with fr_ipfstate held as a write lock.
/*
 * fr_delstate(is)
 *
 * Unlinks the state entry is from both lists it lives on: the global
 * list (is_next / is_pnext) and its hash chain (is_hnext / is_phnext).
 * Also adjusts ips_stats.iss_inuse and destroys the per-entry lock.
 * The comment preceding this function requires the caller to hold the
 * state lock for writing.
 *
 * NOTE(review): several lines (null checks before the back-pointer
 * updates, the action taken for wildcard entries, the rule release and
 * the final free) are not visible in this excerpt.
 */
1727 static void fr_delstate(is
)
/*
 * Entries created with wildcard port/address flags are special-cased.
 * NOTE(review): the statement executed for them is not visible here.
 */
1732 if (is
->is_flags
& (FI_WILDP
|FI_WILDA
))
/* Remove from the global list: fix the successor, then the predecessor link. */
1735 is
->is_next
->is_pnext
= is
->is_pnext
;
1736 *is
->is_pnext
= is
->is_next
;
/* Same removal on the hash chain. */
1738 is
->is_hnext
->is_phnext
= is
->is_phnext
;
1739 *is
->is_phnext
= is
->is_hnext
;
/* iss_inuse is decremented only when the bucket has become empty. */
1740 if (ips_table
[is
->is_hv
] == NULL
)
1741 ips_stats
.iss_inuse
--;
/*
 * NOTE(review): fr is assigned on a line not shown here; this test
 * looks at the reference count of that rule.
 */
1748 if (fr
->fr_ref
== 0) {
1753 MUTEX_DESTROY(&is
->is_lock
);
1761 * Free memory in use by all state info. kept.
/*
 * fr_stateunload()
 *
 * Tears down the state table: under the ipf_state write lock it loops
 * while ips_list is non-empty, resets ips_stats.iss_inuse to zero, then
 * drops the lock and releases the ips_table bucket array.
 *
 * NOTE(review): the body of the while loop and the declaration of is
 * are not visible in this excerpt.
 */
1763 void fr_stateunload()
1767 WRITE_ENTER(&ipf_state
);
/* Loop until the head of the global list is NULL. */
1768 while ((is
= ips_list
))
1770 ips_stats
.iss_inuse
= 0;
1772 RWLOCK_EXIT(&ipf_state
);
/* The table holds fr_statesize bucket pointers; free exactly that many. */
1774 KFREES(ips_table
, fr_statesize
* sizeof(ipstate_t
*));
1780 * Slowly expire held state for things like UDP and ICMP. Timeouts are set
1781 * in expectation of this being called twice per second.
/*
 * fr_timeoutstate()
 *
 * Periodic ageing pass over the global state list. Under the ipf_state
 * write lock every entry with a non-zero is_age has that counter
 * decremented; an entry whose counter reaches zero is counted in the
 * statistics and logged with ISL_EXPIRE. A pending flush request
 * (fr_state_doflush) is serviced before the lock is released.
 *
 * NOTE(review): the removal of the expired entry and the advance of isp
 * for surviving entries are on lines not visible in this excerpt.
 */
1783 void fr_timeoutstate()
1785 ipstate_t
*is
, **isp
;
1786 #if defined(_KERNEL) && !SOLARIS && !defined(__DragonFly__)
1791 WRITE_ENTER(&ipf_state
);
/* isp is not advanced in the for header; see the note above. */
1792 for (isp
= &ips_list
; (is
= *isp
); )
/* An is_age of zero is skipped; otherwise decrement and test for expiry. */
1793 if (is
->is_age
&& !--is
->is_age
) {
1794 if (is
->is_p
== IPPROTO_TCP
)
1795 ips_stats
.iss_fin
++;
/* NOTE(review): presumably the else arm of the TCP test - confirm. */
1797 ips_stats
.iss_expire
++;
1799 ipstate_log(is
, ISL_EXPIRE
);
/* Deferred flush: run it once, then clear the request flag. */
1804 if (fr_state_doflush
) {
1805 (void) fr_state_flush(2, 0);
1806 fr_state_doflush
= 0;
1808 RWLOCK_EXIT(&ipf_state
);
1814 * Original idea from Pradeep Krishnan for use primarily with NAT code.
1815 * (pkrishna@netcom.com)
1817 * Rewritten by Arjan de Vet <Arjan.deVet@adv.iae.nl>, 2000-07-29:
1819 * - (try to) base state transitions on real evidence only,
1820 * i.e. packets that are sent and have been received by ipfilter;
1821 * diagram 18.12 of TCP/IP volume 1 by W. Richard Stevens was used.
1823 * - deal with half-closed connections correctly;
1825 * - store the state of the source in state[0] such that ipfstat
1826 * displays the state as source/dest instead of dest/source; the calls
1827 * to fr_tcp_age have been changed accordingly.
1831 * state[0] = state of source (host that initiated connection)
1832 * state[1] = state of dest (host that accepted the connection)
1834 * dir == 0 : a packet from source to dest
1835 * dir == 1 : a packet from dest to source
/*
 * fr_tcp_age(age, state, fin, dir, fsm)
 *
 * Advances the tracked TCP state of one side of a connection based on
 * the flags of the packet described by fin, and selects a new timeout.
 *
 *   age   - written through (*age) with the chosen timeout value
 *   state - per-side TCP states; state[dir] is the sender of this
 *           packet and state[1 - dir] the other side
 *   fin   - packet information; fin_dp is used as the TCP header
 *   dir   - index of the sending side (0 or 1, as described in the
 *           comment preceding this function)
 *   fsm   - NOTE(review): not referenced on any line visible here
 *
 * An RST is handled first; otherwise a per-state case analysis updates
 * state[dir] and records the timeout in newage.
 *
 * NOTE(review): the switch header, break statements, several else arms
 * and the return statement are not visible in this excerpt, so the
 * return value is not documented here.
 */
1838 int fr_tcp_age(age
, state
, fin
, dir
, fsm
)
1844 tcphdr_t
*tcp
= (tcphdr_t
*)fin
->fin_dp
;
1845 u_char flags
= tcp
->th_flags
;
/* ostate is the state of the side that did not send this packet. */
1849 ostate
= state
[1 - dir
];
/* Payload length: total length minus IP header minus TCP header (th_off is in 32-bit words). */
1851 dlen
= fin
->fin_plen
- fin
->fin_hlen
- (tcp
->th_off
<< 2);
/*
 * RST handling: with no PUSH flag and no payload the sender goes to
 * CLOSED with the fr_tcpclosed timeout. The CLOSE_WAIT assignment that
 * follows is presumably the else arm (not visible here) - confirm.
 */
1853 if (flags
& TH_RST
) {
1854 if (!(tcp
->th_flags
& TH_PUSH
) && !dlen
) {
1855 *age
= fr_tcpclosed
;
1856 state
[dir
] = TCPS_CLOSED
;
1858 *age
= fr_tcpclosewait
;
1859 state
[dir
] = TCPS_CLOSE_WAIT
;
1868 case TCPS_CLOSED
: /* 0 */
1869 if ((flags
& TH_OPENING
) == TH_OPENING
) {
1871 * 'dir' received an S and sends SA in response,
1872 * CLOSED -> SYN_RECEIVED
1874 state
[dir
] = TCPS_SYN_RECEIVED
;
1875 newage
= fr_tcptimeout
;
1876 } else if ((flags
& TH_OPENING
) == TH_SYN
) {
1877 /* 'dir' sent S, CLOSED -> SYN_SENT */
1878 state
[dir
] = TCPS_SYN_SENT
;
1879 newage
= fr_tcptimeout
;
1883 * It is apparently possible that a hosts sends two syncs
1884 * before the remote party is able to respond with a SA. In
1885 * such a case the remote server sometimes ACK's the second
1886 * sync, and then responds with a SA. The following code
1887 * is used to prevent this ack from being blocked.
1889 * We do not reset the timeout here to fr_tcptimeout because
1890 * a connection connect timeout does not renew after every
1891 * packet that is sent. We need to set newage to something
1892 * to indicate the packet has passed the check for its flags
1893 * being valid in the TCP FSM.
1895 else if ((ostate
== TCPS_SYN_SENT
) &&
1896 ((flags
& (TH_FIN
|TH_SYN
|TH_RST
|TH_ACK
)) == TH_ACK
)) {
1901 * The next piece of code makes it possible to get
1902 * already established connections into the state table
1903 * after a restart or reload of the filter rules; this
1904 * does not work when a strict 'flags S keep state' is
1905 * used for tcp connections of course, however, use a
1906 * lower time-out so the state disappears quickly if
1907 * the other side does not pick it up.
1910 (flags
& (TH_FIN
|TH_SYN
|TH_RST
|TH_ACK
)) == TH_ACK
) {
1911 /* we saw an A, guess 'dir' is in ESTABLISHED mode */
1912 if (ostate
== TCPS_CLOSED
) {
1913 state
[dir
] = TCPS_ESTABLISHED
;
1914 newage
= fr_tcptimeout
;
1915 } else if (ostate
== TCPS_ESTABLISHED
) {
1916 state
[dir
] = TCPS_ESTABLISHED
;
1917 newage
= fr_tcpidletimeout
;
1921 * TODO: besides regular ACK packets we can have other
1922 * packets as well; it is yet to be determined how we
1923 * should initialize the states in those cases
1927 case TCPS_LISTEN
: /* 1 */
1931 case TCPS_SYN_SENT
: /* 2 */
1932 if ((flags
& ~(TH_ECN
|TH_CWR
)) == TH_SYN
) {
1934 * A retransmitted SYN packet. We do not reset the
1935 * timeout here to fr_tcptimeout because a connection
1936 * connect timeout does not renew after every packet
1937 * that is sent. We need to set newage to something
1938 * to indicate the packet has passed the check for its
1939 * flags being valid in the TCP FSM.
1942 } else if ((flags
& (TH_SYN
|TH_FIN
|TH_ACK
)) == TH_ACK
) {
1944 * We see an A from 'dir' which is in SYN_SENT
1945 * state: 'dir' sent an A in response to an SA
1946 * which it received, SYN_SENT -> ESTABLISHED
1948 state
[dir
] = TCPS_ESTABLISHED
;
1949 newage
= fr_tcpidletimeout
;
1950 } else if (flags
& TH_FIN
) {
1952 * We see an F from 'dir' which is in SYN_SENT
1953 * state and wants to close its side of the
1954 * connection; SYN_SENT -> FIN_WAIT_1
1956 state
[dir
] = TCPS_FIN_WAIT_1
;
1957 newage
= fr_tcpidletimeout
; /* or fr_tcptimeout? */
1958 } else if ((flags
& TH_OPENING
) == TH_OPENING
) {
1960 * We see an SA from 'dir' which is already in
1961 * SYN_SENT state, this means we have a
1962 * simultaneous open; SYN_SENT -> SYN_RECEIVED
1964 state
[dir
] = TCPS_SYN_RECEIVED
;
1965 newage
= fr_tcptimeout
;
1969 case TCPS_SYN_RECEIVED
: /* 3 */
1970 if ((flags
& (TH_SYN
|TH_FIN
|TH_ACK
)) == TH_ACK
) {
1972 * We see an A from 'dir' which was in SYN_RECEIVED
1973 * state so it must now be in established state,
1974 * SYN_RECEIVED -> ESTABLISHED
1976 state
[dir
] = TCPS_ESTABLISHED
;
1977 newage
= fr_tcpidletimeout
;
1978 } else if ((flags
& ~(TH_ECN
|TH_CWR
)) == TH_OPENING
) {
1980 * We see an SA from 'dir' which is already in
1981 * SYN_RECEIVED state.
1983 newage
= fr_tcptimeout
;
1984 } else if (flags
& TH_FIN
) {
1986 * We see an F from 'dir' which is in SYN_RECEIVED
1987 * state and wants to close its side of the connection;
1988 * SYN_RECEIVED -> FIN_WAIT_1
1990 state
[dir
] = TCPS_FIN_WAIT_1
;
1991 newage
= fr_tcpidletimeout
;
1995 case TCPS_ESTABLISHED
: /* 4 */
1996 if (flags
& TH_FIN
) {
1998 * 'dir' closed its side of the connection; this
1999 * gives us a half-closed connection;
2000 * ESTABLISHED -> FIN_WAIT_1
2002 state
[dir
] = TCPS_FIN_WAIT_1
;
2003 newage
= fr_tcphalfclosed
;
2004 } else if (flags
& TH_ACK
) {
2005 /* an ACK, should we exclude other flags here? */
2006 if (ostate
== TCPS_FIN_WAIT_1
) {
2008 * We know the other side did an active close,
2009 * so we are ACKing the recvd FIN packet (does
2010 * the window matching code guarantee this?)
2011 * and go into CLOSE_WAIT state; this gives us
2012 * a half-closed connection
2014 state
[dir
] = TCPS_CLOSE_WAIT
;
2015 newage
= fr_tcphalfclosed
;
2016 } else if (ostate
< TCPS_CLOSE_WAIT
)
2018 * Still a fully established connection,
2021 newage
= fr_tcpidletimeout
;
2025 case TCPS_CLOSE_WAIT
: /* 5 */
2026 if (flags
& TH_FIN
) {
2028 * Application closed and 'dir' sent a FIN, we're now
2029 * going into LAST_ACK state
2031 newage
= fr_tcplastack
;
2032 state
[dir
] = TCPS_LAST_ACK
;
2035 * We remain in CLOSE_WAIT because the other side has
2036 * closed already and we did not close our side yet;
2039 newage
= fr_tcphalfclosed
;
2043 case TCPS_FIN_WAIT_1
: /* 6 */
2044 if ((flags
& TH_ACK
) && ostate
> TCPS_CLOSE_WAIT
) {
2046 * If the other side is not active anymore it has sent
2047 * us a FIN packet that we are ack'ing now with an ACK;
2048 * this means both sides have now closed the connection
2049 * and we go into TIME_WAIT
2052 * XXX: how do we know we really are ACKing the FIN
2053 * packet here? does the window code guarantee that?
2055 state
[dir
] = TCPS_TIME_WAIT
;
2056 newage
= fr_tcptimeout
;
2059 * We closed our side of the connection already but the
2060 * other side is still active (ESTABLISHED/CLOSE_WAIT);
2061 * continue with this half-closed connection
2063 newage
= fr_tcphalfclosed
;
2066 case TCPS_CLOSING
: /* 7 */
2070 case TCPS_LAST_ACK
: /* 8 */
2071 if (flags
& TH_ACK
) {
2072 if ((flags
& TH_PUSH
) || dlen
)
2074 * There is still data to be delivered, reset
2077 newage
= fr_tcplastack
;
2082 * We cannot detect when we go out of LAST_ACK state to CLOSED
2083 * because that is based on the reception of ACK packets;
2084 * ipfilter can only detect that a packet has been sent by a
2089 case TCPS_FIN_WAIT_2
: /* 9 */
2093 case TCPS_TIME_WAIT
: /* 10 */
2094 newage
= fr_tcptimeout
; /* default 4 mins */
2095 /* we're in 2MSL timeout now */
/*
 * ipstate_log(is, type)
 *
 * Builds a log record (ipsl) describing the state entry is and submits
 * it through ipllog() on the IPL_LOGSTATE device.
 *
 *   is   - state entry whose fields are copied into the record
 *   type - stored in the record as isl_type (callers in this file pass
 *          values such as ISL_EXPIRE)
 *
 * Protocol-specific fields: ports for TCP and UDP, the two per-side
 * states additionally for TCP, and the ICMP type for ICMP and ICMPv6.
 * ips_stats.iss_logged is incremented when ipllog() returns non-zero.
 *
 * NOTE(review): the declarations of ipsl, items, sizes and types and
 * some else arms are not visible in this excerpt.
 */
2108 void ipstate_log(is
, type
)
2117 ipsl
.isl_type
= type
;
/* The packet count reported is the sum of normal and ICMP packet counters. */
2118 ipsl
.isl_pkts
= is
->is_pkts
+ is
->is_icmppkts
;
2119 ipsl
.isl_bytes
= is
->is_bytes
;
2120 ipsl
.isl_src
= is
->is_src
;
2121 ipsl
.isl_dst
= is
->is_dst
;
2122 ipsl
.isl_p
= is
->is_p
;
2123 ipsl
.isl_v
= is
->is_v
;
2124 ipsl
.isl_flags
= is
->is_flags
;
2125 ipsl
.isl_rulen
= is
->is_rulen
;
2126 ipsl
.isl_group
= is
->is_group
;
/* Ports exist only for TCP and UDP. */
2127 if (ipsl
.isl_p
== IPPROTO_TCP
|| ipsl
.isl_p
== IPPROTO_UDP
) {
2128 ipsl
.isl_sport
= is
->is_sport
;
2129 ipsl
.isl_dport
= is
->is_dport
;
/* TCP also records the connection state of both sides. */
2130 if (ipsl
.isl_p
== IPPROTO_TCP
) {
2131 ipsl
.isl_state
[0] = is
->is_state
[0];
2132 ipsl
.isl_state
[1] = is
->is_state
[1];
/* ICMP and ICMPv6 both copy the same ics_type field. */
2134 } else if (ipsl
.isl_p
== IPPROTO_ICMP
) {
2135 ipsl
.isl_itype
= is
->is_icmp
.ics_type
;
2136 } else if (ipsl
.isl_p
== IPPROTO_ICMPV6
) {
2137 ipsl
.isl_itype
= is
->is_icmp
.ics_type
;
/*
 * The filler words are zeroed.
 * NOTE(review): presumably the final else arm for other protocols;
 * the branch keyword is not visible here - confirm.
 */
2139 ipsl
.isl_ps
.isl_filler
[0] = 0;
2140 ipsl
.isl_ps
.isl_filler
[1] = 0;
2143 sizes
[0] = sizeof(ipsl
);
/* A single item is submitted (last argument is 1). */
2146 if (ipllog(IPL_LOGSTATE
, NULL
, items
, sizes
, types
, 1)) {
2147 ATOMIC_INCL(ips_stats
.iss_logged
);
/* NOTE(review): presumably the else arm, counting failed submissions - confirm. */
2149 ATOMIC_INCL(ips_stats
.iss_logfail
);
2156 frentry_t
*fr_checkicmp6matchingstate(ip
, fin
)
2160 ipstate_t
*is
, **isp
;
2161 u_short sport
, dport
;
2163 struct icmp6_hdr
*ic
, *oic
;
2164 union i6addr dst
, src
;
2174 * Does it at least have the return (basic) IP header ?
2175 * Only a basic IP header (no options) should be with
2176 * an ICMP error header.
2178 if ((fin
->fin_v
!= 6) || (fin
->fin_plen
< ICMP6ERR_MINPKTLEN
))
2180 ic
= (struct icmp6_hdr
*)fin
->fin_dp
;
2181 type
= ic
->icmp6_type
;
2183 * If it's not an error type, then return
2185 if ((type
!= ICMP6_DST_UNREACH
) && (type
!= ICMP6_PACKET_TOO_BIG
) &&
2186 (type
!= ICMP6_TIME_EXCEEDED
) && (type
!= ICMP6_PARAM_PROB
))
2189 oip
= (ip6_t
*)((char *)ic
+ ICMPERR_ICMPHLEN
);
2190 if (fin
->fin_plen
< sizeof(*oip
))
2193 if ((oip
->ip6_nxt
!= IPPROTO_TCP
) && (oip
->ip6_nxt
!= IPPROTO_UDP
) &&
2194 (oip
->ip6_nxt
!= IPPROTO_ICMPV6
))
2197 bzero((char *)&ofin
, sizeof(ofin
));
2198 ofin
.fin_out
= !fin
->fin_out
;
2199 ofin
.fin_ifp
= fin
->fin_ifp
;
2203 ofin
.fin_qfm
= NULL
;
2206 * We make a fin entry to be able to feed it to
2207 * matchsrcdst. Note that not all fields are necessary
2208 * but this is the cleanest way. Note further we fill
2209 * in fin_mp such that if someone uses it we'll get
2210 * a kernel panic. fr_matchsrcdst does not use this.
2212 * watch out here, as ip is in host order and oip in network
2213 * order. Any change we make must be undone afterwards.
2215 savelen
= oip
->ip6_plen
;
2216 oip
->ip6_plen
= ip
->ip6_plen
- sizeof(*ip
) - ICMPERR_ICMPHLEN
;
2217 fr_makefrip(sizeof(*oip
), (ip_t
*)oip
, &ofin
);
2218 oip
->ip6_plen
= savelen
;
2220 if (oip
->ip6_nxt
== IPPROTO_ICMPV6
) {
2221 oic
= (struct icmp6_hdr
*)(oip
+ 1);
2223 * an ICMP error can only be generated as a result of an
2224 * ICMP query, not as the response on an ICMP error
2226 * XXX theoretically ICMP_ECHOREP and the other replies are
2227 * ICMP queries as well, but adding them here seems strange XXX
2229 if (!(oic
->icmp6_type
& ICMP6_INFOMSG_MASK
))
2233 * perform a lookup of the ICMP packet in the state table
2235 hv
= (pr
= oip
->ip6_nxt
);
2236 src
.in6
= oip
->ip6_src
;
2237 hv
+= src
.in4
.s_addr
;
2238 dst
.in6
= oip
->ip6_dst
;
2239 hv
+= dst
.in4
.s_addr
;
2240 hv
+= oic
->icmp6_id
;
2241 hv
+= oic
->icmp6_seq
;
2244 READ_ENTER(&ipf_state
);
2245 for (isp
= &ips_table
[hv
]; (is
= *isp
); isp
= &is
->is_hnext
)
2246 if ((is
->is_p
== pr
) &&
2247 (oic
->icmp6_id
== is
->is_icmp
.ics_id
) &&
2248 (oic
->icmp6_seq
== is
->is_icmp
.ics_seq
) &&
2249 fr_matchsrcdst(is
, src
, dst
, &ofin
, NULL
)) {
2251 * in the state table ICMP queries are stored
2252 * with the type of the corresponding ICMP
2253 * response. Correct here
2255 if (((is
->is_type
== ICMP6_ECHO_REPLY
) &&
2256 (oic
->icmp6_type
== ICMP6_ECHO_REQUEST
)) ||
2257 (is
->is_type
- 1 == oic
->icmp6_type
)) {
2258 ips_stats
.iss_hits
++;
2260 is
->is_bytes
+= fin
->fin_plen
;
2264 RWLOCK_EXIT(&ipf_state
);
2269 tcp
= (tcphdr_t
*)(oip
+ 1);
2270 dport
= tcp
->th_dport
;
2271 sport
= tcp
->th_sport
;
2273 hv
= (pr
= oip
->ip6_nxt
);
2274 src
.in6
= oip
->ip6_src
;
2275 hv
+= src
.in4
.s_addr
;
2279 dst
.in6
= oip
->ip6_dst
;
2280 hv
+= dst
.in4
.s_addr
;
2288 READ_ENTER(&ipf_state
);
2289 for (isp
= &ips_table
[hv
]; (is
= *isp
); isp
= &is
->is_hnext
) {
2291 * Only allow this icmp through if the
2292 * encapsulated packet was allowed through the
2293 * other way around. Note that the minimal amount
2294 * of info present does not allow for checking against
2295 * tcp internals such as seq and ack numbers.
2297 if ((is
->is_p
== pr
) && (is
->is_v
== 6) &&
2298 fr_matchsrcdst(is
, src
, dst
, &ofin
, tcp
)) {
2300 ips_stats
.iss_hits
++;
2302 is
->is_bytes
+= fin
->fin_plen
;
2304 * we deliberately do not touch the timeouts
2305 * for the accompanying state table entry.
2306 * It remains to be seen if that is correct. XXX
2308 RWLOCK_EXIT(&ipf_state
);
2312 RWLOCK_EXIT(&ipf_state
);