/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <linux/bpf_insn.h>
#include <net/ethernet.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "in-addr-prefix-util.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "set.h"
#include "string-util.h"
#include "strv.h"
#include "unit.h"
#include "virt.h"

/* Slots of the accounting array maps, and the verdict bits OR'ed into R8 by the compiled program. */
enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED  = 2,
};
/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        /* One lookup block per protocol: a 32-bit word for IPv4, four words for IPv6 */
        struct bpf_insn insn[] = {
                /* If skb->protocol != @protocol, skip this whole block. The offset will be set later. */
                BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                /*
                 * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                 *
                 * R1: Pointer to the skb
                 * R2: Data offset
                 * R3: Destination buffer on the stack (r10 - addr_size)
                 * R4: Number of bytes to read (addr_size)
                 */

                BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                BPF_MOV32_IMM(BPF_REG_4, addr_size),
                BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                /*
                 * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                 * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                 * has to be set to the maximum possible value.
                 *
                 * On success, the looked up value is stored in R0. For this application, the actual
                 * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                 * matching value.
                 */

                BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        /* Jump label fixup: make the protocol check jump past the end of this block */
        insn[0].off = ELEMENTSOF(insn) - 1;

        r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
        if (r < 0)
                return r;

        return 0;
}
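/*
 * For reference, a sketch of the stack layout the block above assembles (derived
 * from the instructions, assuming the kernel's 'struct bpf_lpm_trie_key' layout
 * of a 32-bit prefixlen immediately followed by the address data):
 *
 *     fp - addr_size - 4 : u32 prefixlen = addr_size * 8 (32 for IPv4, 128 for IPv6)
 *     fp - addr_size     : address bytes written by BPF_FUNC_skb_load_bytes
 *
 * R2 is pointed at the prefixlen word, so the kernel sees one contiguous LPM trie
 * key on the stack for the map lookup.
 */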
static int add_instructions_for_ip_any(
                BPFProgram *p,
                int verdict) {

        int r;

        assert(p);

        /* Unconditionally OR the verdict bit into R8, i.e. "any address" matches. */
        const struct bpf_insn insn[] = {
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        r = bpf_program_add_instructions(p, insn, 1);
        if (r < 0)
                return r;

        return 0;
}
static int bpf_firewall_compile_bpf(
                Unit *u,
                const char *prog_name,
                bool is_ingress,
                BPFProgram **ret,
                bool ip_allow_any,
                bool ip_deny_any) {

        const struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied:
         *
         *     R0 = 1
         *     if (R8 == ACCESS_DENIED)
         *         R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        const struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0 ||
                ip_allow_any ||
                ip_deny_any;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_allow_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_deny_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        /*
         * Exit from the eBPF program, R0 contains the verdict.
         * 0 means the packet is denied, 1 means the packet may pass.
         */
        const struct bpf_insn exit_insn[] = {
                BPF_EXIT_INSN()
        };

        r = bpf_program_add_instructions(p, exit_insn, ELEMENTSOF(exit_insn));
        if (r < 0)
                return r;

        *ret = TAKE_PTR(p);

        return 0;
}
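/*
 * Taken together, the generated program has roughly this shape (a sketch of the
 * logic, not literal disassembly):
 *
 *     r6 = r1                          // save skb pointer
 *     r7 = skb->protocol               // cached for the lookup blocks
 *     r8 = 0                           // verdict bits
 *     ... per-map lookup blocks OR ACCESS_ALLOWED/ACCESS_DENIED into r8 ...
 *     r0 = (r8 == ACCESS_DENIED) ? 0 : 1
 *     ... if r0 != 0 and accounting is on: bump packet and byte counters ...
 *     exit                             // r0 is the verdict
 */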
static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
        struct in_addr_prefix *a;

        assert(n_ipv4);
        assert(n_ipv6);

        SET_FOREACH(a, prefixes)
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }

        return 0;
}
static int bpf_firewall_add_access_items(
                Set *prefixes,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        struct in_addr_prefix *a;
        uint64_t value = verdict;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        SET_FOREACH(a, prefixes)
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }

        return 0;
}
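/*
 * For illustration: a configured prefix such as 10.0.0.0/8 becomes an LPM trie
 * entry with key->prefixlen = 8 and the four address bytes in key->data. At
 * runtime the compiled program looks up the concrete address with prefixlen set
 * to the maximum (32 or 128), and the kernel matches the longest stored prefix.
 */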
static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd,
                bool *ret_has_any) {

        _cleanup_close_ int ipv4_map_fd = -EBADF, ipv6_map_fd = -EBADF;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);
        assert(ret_has_any);

        for (p = u; p; p = UNIT_GET_SLICE(p)) {
                CGroupContext *cc;
                Set *prefixes;
                bool *reduced;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
                reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;

                if (!*reduced) {
                        r = in_addr_prefixes_reduce(prefixes);
                        if (r < 0)
                                return r;

                        *reduced = true;
                }

                bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);

                /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
                 * needing CAP_SYS_ADMIN for allocating an LPM trie map. */
                if (in_addr_prefixes_is_any(prefixes)) {
                        *ret_has_any = true;
                        return 0;
                }
        }

        if (n_ipv4 > 0) {
                char *name = strjoina("4_", u->id);
                ipv4_map_fd = bpf_map_new(
                                name,
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC); /* LPM tries must be created with BPF_F_NO_PREALLOC */
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                char *name = strjoina("6_", u->id);
                ipv6_map_fd = bpf_map_new(
                                name,
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_GET_SLICE(p)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
        *ret_has_any = false;
        return 0;
}
static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        char *name = strjoina("I_", u->id);
                        r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        char *name = strjoina("E_", u->id);
                        r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}
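/*
 * The accounting maps are plain two-element arrays: slot MAP_KEY_PACKETS holds
 * the packet counter and slot MAP_KEY_BYTES the byte counter, each a uint64_t
 * that the compiled program increments with atomic adds (BPF_XADD).
 */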
int bpf_firewall_compile(Unit *u) {
        const char *ingress_name = NULL, *egress_name = NULL;
        bool ip_allow_any = false, ip_deny_any = false;
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF firewalling not supported, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units.");

        /* If the BPF_F_ALLOW_MULTI flag is supported, program names are also supported (both were added in
         * kernel v4.15). */
        if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
                ingress_name = "sd_fw_ingress";
                egress_name = "sd_fw_egress";
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");

        return 0;
}
static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
        set_clear(*set);

        STRV_FOREACH(bpf_fs_path, filter_paths) {
                _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
                int r;

                r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m");

                r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Loading of BPF program %s failed: %m", *bpf_fs_path);

                r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
                if (r < 0)
                        return log_oom();
        }

        return 0;
}
int bpf_firewall_load_custom(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
                return 0;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;

        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
        if (r < 0)
                return r;

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
        if (r < 0)
                return r;

        return 0;
}
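/* The filter path lists handled above come from the unit's IPIngressFilterPath=
 * and IPEgressFilterPath= settings, i.e. programs pinned in the BPF file system. */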
static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
        BPFProgram *prog;
        int r;

        assert(u);

        set_clear(*set_installed);
        r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
        if (r < 0)
                return log_oom();

        /* Move programs from *set to *set_installed one by one, attaching them as we go. */
        SET_FOREACH_MOVE(prog, *set_installed, *set) {
                r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom BPF program to cgroup %s failed: %m", path);
        }

        return 0;
}
int bpf_firewall_install(Unit *u) {
        _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF firewalling not supported, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
            (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");

        flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0;

        if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) {
                /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only
                 * after attaching the new programs, so that there's no time window where neither program is
                 * attached. (There will be a time window where both are attached, but that's OK, since this is a
                 * security feature where we'd rather lock down too much than too little.) */
                ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed);
                ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed);
        } else {
                /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
                 * detach them) right before attaching the new program, to minimize the time window when we
                 * don't account for IP traffic. */
                u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
                u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
        }

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r,
                                                    "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r,
                                                    "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress);
        }

        /* And now, definitely get rid of the old programs, and detach them */
        ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
        ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
        if (r < 0)
                return r;

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
        if (r < 0)
                return r;

        return 0;
}
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        /* Write the packet count last, so the outputs are only modified if both lookups succeeded. */
        if (ret_packets)
                *ret_packets = packets;

        return 0;
}
int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}
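/*
 * Example usage (a sketch): read a unit's ingress counters, then reset them.
 *
 *     uint64_t bytes, packets;
 *     r = bpf_firewall_read_accounting(u->ip_accounting_ingress_map_fd, &bytes, &packets);
 *     if (r >= 0)
 *             r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
 */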
static int bpf_firewall_unsupported_reason = 0;

int bpf_firewall_supported(void) {
        const struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int r;

        /* Checks whether BPF firewalling is supported. For this, we check the following things:
         *
         * - whether the unified hierarchy is being used
         * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
         */
        if (supported >= 0)
                return supported;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "bpf-firewall: Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
                                        "bpf-firewall: Not running with unified cgroup hierarchy, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* prog_name is NULL since it is supported only starting from v4.15 kernel. */
        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        // FIXME: Clang doesn't 0-pad with structured initialization, causing
        // the kernel to reject the bpf_attr as invalid. See:
        // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
        // Ideally it should behave like GCC, so that we can remove these workarounds.
        zero(attr);
        attr.attach_type = BPF_CGROUP_INET_EGRESS;
        attr.target_fd = -EBADF;
        attr.attach_bpf_fd = -EBADF;

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        bpf_firewall_unsupported_reason =
                                log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* EBADF means the parameters were actually validated, hence CONFIG_CGROUP_BPF is enabled. */
        } else {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EBADE),
                                        "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? "
                                        "Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available.
         * Use the probe result as the indicator that program names are also supported, since they both were
         * added in kernel 4.15. */
        zero(attr);
        attr.attach_type = BPF_CGROUP_INET_EGRESS;
        attr.target_fd = -EBADF;
        attr.attach_bpf_fd = -EBADF;
        attr.attach_flags = BPF_F_ALLOW_MULTI;

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        }

        bpf_firewall_unsupported_reason =
                log_debug_errno(SYNTHETIC_ERRNO(EBADE),
                                "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? "
                                "Something is weird, assuming BPF firewalling is broken and hence not supported.");
        return supported = BPF_FIREWALL_UNSUPPORTED;
}
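/*
 * To recap the probe ladder above: the cached result is one of
 * BPF_FIREWALL_UNSUPPORTED (no usable cgroup BPF at all), BPF_FIREWALL_SUPPORTED
 * (cgroup BPF works, but only one program per cgroup and no program names), or
 * BPF_FIREWALL_SUPPORTED_WITH_MULTI (BPF_F_ALLOW_MULTI and program names
 * available, i.e. kernel >= 4.15).
 */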
void emit_bpf_firewall_warning(Unit *u) {
        static bool warned = false;

        assert(u);
        assert(u->manager);

        if (warned || MANAGER_IS_TEST_RUN(u->manager))
                return;

        bool quiet = ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0;

        log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
                            "unit configures an IP firewall, but %s.\n"
                            "(This warning is only shown for the first unit using IP firewalling.)",
                            getuid() != 0 ? "not running as root" :
                                            "the local system does not support BPF/cgroup firewalling");
        warned = true;
}
void bpf_firewall_close(Unit *u) {
        assert(u);

        u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd);
        u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
        u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
        u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
        u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);

        u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress);
        u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress);
        u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed);
        u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed);
}