src/core/bpf-firewall.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <arpa/inet.h>
   4 #include <assert.h>
   5 #include <errno.h>
   6 #include <fcntl.h>
   7 #include <linux/bpf_insn.h>
   8 #include <net/ethernet.h>
   9 #include <net/if.h>
  10 #include <netinet/ip.h>
  11 #include <netinet/ip6.h>
  12 #include <stddef.h>
  13 #include <stdio.h>
  14 #include <stdlib.h>
  15 #include <unistd.h>
  16
  17 #include "alloc-util.h"
  18 #include "bpf-firewall.h"
  19 #include "bpf-program.h"
  20 #include "fd-util.h"
  21 #include "in-addr-prefix-util.h"
  22 #include "memory-util.h"
  23 #include "missing_syscall.h"
  24 #include "unit.h"
  25 #include "strv.h"
  26 #include "virt.h"
  27
  28 enum {
  29         MAP_KEY_PACKETS,
  30         MAP_KEY_BYTES,
  31 };
  32
  33 enum {
  34         ACCESS_ALLOWED = 1,
  35         ACCESS_DENIED  = 2,
  36 };
  37
  38 /* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
  39
  40 static int add_lookup_instructions(
  41                 BPFProgram *p,
  42                 int map_fd,
  43                 int protocol,
  44                 bool is_ingress,
  45                 int verdict) {
  46
  47         int r, addr_offset, addr_size;
  48
  49         assert(p);
  50         assert(map_fd >= 0);
  51
  52         switch (protocol) {
  53
  54         case ETH_P_IP:
  55                 addr_size = sizeof(uint32_t);
  56                 addr_offset = is_ingress ?
  57                         offsetof(struct iphdr, saddr) :
  58                         offsetof(struct iphdr, daddr);
  59                 break;
  60
  61         case ETH_P_IPV6:
  62                 addr_size = 4 * sizeof(uint32_t);
  63                 addr_offset = is_ingress ?
  64                         offsetof(struct ip6_hdr, ip6_src.s6_addr) :
  65                         offsetof(struct ip6_hdr, ip6_dst.s6_addr);
  66                 break;
  67
  68         default:
  69                 return -EAFNOSUPPORT;
  70         }
  71
  72         do {
  73                 /* Compare IPv4 with one word instruction (32-bit) */
  74                 struct bpf_insn insn[] = {
  75                         /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
  76                         BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
  77
  78                         /*
  79                          * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
  80                          *
  81                          * R1: Pointer to the skb
  82                          * R2: Data offset
  83                          * R3: Destination buffer on the stack (r10 - 4)
  84                          * R4: Number of bytes to read (4)
  85                          */
  86
  87                         BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
  88                         BPF_MOV32_IMM(BPF_REG_2, addr_offset),
  89
  90                         BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
  91                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
  92
  93                         BPF_MOV32_IMM(BPF_REG_4, addr_size),
  94                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
  95
  96                         /*
  97                          * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
  98                          * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
  99                          * has to be set to the maximum possible value.
 100                          *
 101                          * On success, the looked up value is stored in R0. For this application, the actual
 102                          * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
 103                          * matching value.
 104                          */
 105
 106                         BPF_LD_MAP_FD(BPF_REG_1, map_fd),
 107                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 108                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
 109                         BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
 110
 111                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 112                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
 113                         BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
 114                 };
 115
 116                 /* Jump label fixup */
 117                 insn[0].off = ELEMENTSOF(insn) - 1;
 118
 119                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 120                 if (r < 0)
 121                         return r;
 122
 123         } while (false);
 124
 125         return 0;
 126 }
 127
 128 static int add_instructions_for_ip_any(
 129                 BPFProgram *p,
 130                 int verdict) {
 131         int r;
 132
 133         assert(p);
 134
 135         const struct bpf_insn insn[] = {
 136                 BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
 137         };
 138
 139         r = bpf_program_add_instructions(p, insn, 1);
 140         if (r < 0)
 141                 return r;
 142
 143         return 0;
 144 }
 145
 146 static int bpf_firewall_compile_bpf(
 147                 Unit *u,
 148                 const char *prog_name,
 149                 bool is_ingress,
 150                 BPFProgram **ret,
 151                 bool ip_allow_any,
 152                 bool ip_deny_any) {
 153
 154         const struct bpf_insn pre_insn[] = {
 155                 /*
 156                  * When the eBPF program is entered, R1 contains the address of the skb.
 157                  * However, R1-R5 are scratch registers that are not preserved when calling
 158                  * into kernel functions, so we need to save anything that's supposed to
 159                  * stay around to R6-R9. Save the skb to R6.
 160                  */
 161                 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 162
 163                 /*
 164                  * Although we cannot access the skb data directly from eBPF programs used in this
 165                  * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
 166                  * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
 167                  * for later use.
 168                  */
 169                 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
 170
 171                 /*
 172                  * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
 173                  * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
 174                  */
 175                 BPF_MOV32_IMM(BPF_REG_8, 0),
 176         };
 177
 178         /*
 179          * The access checkers compiled for the configured allowance and denial lists
 180          * write to R8 at runtime. The following code prepares for an early exit that
 181          * skip the accounting if the packet is denied.
 182          *
 183          * R0 = 1
 184          * if (R8 == ACCESS_DENIED)
 185          *     R0 = 0
 186          *
 187          * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
 188          * is allowed to pass.
 189          */
 190         const struct bpf_insn post_insn[] = {
 191                 BPF_MOV64_IMM(BPF_REG_0, 1),
 192                 BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
 193                 BPF_MOV64_IMM(BPF_REG_0, 0),
 194         };
 195
 196         _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
 197         int accounting_map_fd, r;
 198         bool access_enabled;
 199
 200         assert(u);
 201         assert(ret);
 202
 203         accounting_map_fd = is_ingress ?
 204                 u->ip_accounting_ingress_map_fd :
 205                 u->ip_accounting_egress_map_fd;
 206
 207         access_enabled =
 208                 u->ipv4_allow_map_fd >= 0 ||
 209                 u->ipv6_allow_map_fd >= 0 ||
 210                 u->ipv4_deny_map_fd >= 0 ||
 211                 u->ipv6_deny_map_fd >= 0 ||
 212                 ip_allow_any ||
 213                 ip_deny_any;
 214
 215         if (accounting_map_fd < 0 && !access_enabled) {
 216                 *ret = NULL;
 217                 return 0;
 218         }
 219
 220         r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
 221         if (r < 0)
 222                 return r;
 223
 224         r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
 225         if (r < 0)
 226                 return r;
 227
 228         if (access_enabled) {
 229                 /*
 230                  * The simple rule this function translates into eBPF instructions is:
 231                  *
 232                  * - Access will be granted when an address matches an entry in @list_allow
 233                  * - Otherwise, access will be denied when an address matches an entry in @list_deny
 234                  * - Otherwise, access will be granted
 235                  */
 236
 237                 if (u->ipv4_deny_map_fd >= 0) {
 238                         r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
 239                         if (r < 0)
 240                                 return r;
 241                 }
 242
 243                 if (u->ipv6_deny_map_fd >= 0) {
 244                         r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
 245                         if (r < 0)
 246                                 return r;
 247                 }
 248
 249                 if (u->ipv4_allow_map_fd >= 0) {
 250                         r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
 251                         if (r < 0)
 252                                 return r;
 253                 }
 254
 255                 if (u->ipv6_allow_map_fd >= 0) {
 256                         r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
 257                         if (r < 0)
 258                                 return r;
 259                 }
 260
 261                 if (ip_allow_any) {
 262                         r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
 263                         if (r < 0)
 264                                 return r;
 265                 }
 266
 267                 if (ip_deny_any) {
 268                         r = add_instructions_for_ip_any(p, ACCESS_DENIED);
 269                         if (r < 0)
 270                                 return r;
 271                 }
 272         }
 273
 274         r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
 275         if (r < 0)
 276                 return r;
 277
 278         if (accounting_map_fd >= 0) {
 279                 struct bpf_insn insn[] = {
 280                         /*
 281                          * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
 282                          * The jump label will be fixed up later.
 283                          */
 284                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
 285
 286                         /* Count packets */
 287                         BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
 288                         BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
 289                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 290                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
 291                         BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
 292                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 293                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 294                         BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
 295                         BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
 296
 297                         /* Count bytes */
 298                         BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
 299                         BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
 300                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 301                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
 302                         BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
 303                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 304                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 305                         BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
 306                         BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
 307
 308                         /* Allow the packet to pass */
 309                         BPF_MOV64_IMM(BPF_REG_0, 1),
 310                 };
 311
 312                 /* Jump label fixup */
 313                 insn[0].off = ELEMENTSOF(insn) - 1;
 314
 315                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 316                 if (r < 0)
 317                         return r;
 318         }
 319
 320         do {
 321                 /*
 322                  * Exit from the eBPF program, R0 contains the verdict.
 323                  * 0 means the packet is denied, 1 means the packet may pass.
 324                  */
 325                 const struct bpf_insn insn[] = {
 326                         BPF_EXIT_INSN()
 327                 };
 328
 329                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 330                 if (r < 0)
 331                         return r;
 332         } while (false);
 333
 334         *ret = TAKE_PTR(p);
 335
 336         return 0;
 337 }
 338
 339 static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
 340         struct in_addr_prefix *a;
 341
 342         assert(n_ipv4);
 343         assert(n_ipv6);
 344
 345         SET_FOREACH(a, prefixes)
 346                 switch (a->family) {
 347
 348                 case AF_INET:
 349                         (*n_ipv4)++;
 350                         break;
 351
 352                 case AF_INET6:
 353                         (*n_ipv6)++;
 354                         break;
 355
 356                 default:
 357                         return -EAFNOSUPPORT;
 358                 }
 359
 360         return 0;
 361 }
 362
 363 static int bpf_firewall_add_access_items(
 364                 Set *prefixes,
 365                 int ipv4_map_fd,
 366                 int ipv6_map_fd,
 367                 int verdict) {
 368
 369         struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
 370         struct in_addr_prefix *a;
 371         uint64_t value = verdict;
 372         int r;
 373
 374         key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
 375         key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
 376
 377         SET_FOREACH(a, prefixes)
 378                 switch (a->family) {
 379
 380                 case AF_INET:
 381                         key_ipv4->prefixlen = a->prefixlen;
 382                         memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
 383
 384                         r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
 385                         if (r < 0)
 386                                 return r;
 387
 388                         break;
 389
 390                 case AF_INET6:
 391                         key_ipv6->prefixlen = a->prefixlen;
 392                         memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
 393
 394                         r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
 395                         if (r < 0)
 396                                 return r;
 397
 398                         break;
 399
 400                 default:
 401                         return -EAFNOSUPPORT;
 402                 }
 403
 404         return 0;
 405 }
 406
 407 static int bpf_firewall_prepare_access_maps(
 408                 Unit *u,
 409                 int verdict,
 410                 int *ret_ipv4_map_fd,
 411                 int *ret_ipv6_map_fd,
 412                 bool *ret_has_any) {
 413
 414         _cleanup_close_ int ipv4_map_fd = -EBADF, ipv6_map_fd = -EBADF;
 415         size_t n_ipv4 = 0, n_ipv6 = 0;
 416         Unit *p;
 417         int r;
 418
 419         assert(ret_ipv4_map_fd);
 420         assert(ret_ipv6_map_fd);
 421         assert(ret_has_any);
 422
 423         for (p = u; p; p = UNIT_GET_SLICE(p)) {
 424                 CGroupContext *cc;
 425                 Set *prefixes;
 426                 bool *reduced;
 427
 428                 cc = unit_get_cgroup_context(p);
 429                 if (!cc)
 430                         continue;
 431
 432                 prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
 433                 reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;
 434
 435                 if (!*reduced) {
 436                         r = in_addr_prefixes_reduce(prefixes);
 437                         if (r < 0)
 438                                 return r;
 439
 440                         *reduced = true;
 441                 }
 442
 443                 bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);
 444
 445                 /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
 446                  * needing CAP_SYS_ADMIN for allocating LPM trie map. */
 447                 if (in_addr_prefixes_is_any(prefixes)) {
 448                         *ret_has_any = true;
 449                         return 0;
 450                 }
 451         }
 452
 453         if (n_ipv4 > 0) {
 454                 char *name = strjoina("4_", u->id);
 455                 ipv4_map_fd = bpf_map_new(
 456                                 name,
 457                                 BPF_MAP_TYPE_LPM_TRIE,
 458                                 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
 459                                 sizeof(uint64_t),
 460                                 n_ipv4,
 461                                 BPF_F_NO_PREALLOC);
 462                 if (ipv4_map_fd < 0)
 463                         return ipv4_map_fd;
 464         }
 465
 466         if (n_ipv6 > 0) {
 467                 char *name = strjoina("6_", u->id);
 468                 ipv6_map_fd = bpf_map_new(
 469                                 name,
 470                                 BPF_MAP_TYPE_LPM_TRIE,
 471                                 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
 472                                 sizeof(uint64_t),
 473                                 n_ipv6,
 474                                 BPF_F_NO_PREALLOC);
 475                 if (ipv6_map_fd < 0)
 476                         return ipv6_map_fd;
 477         }
 478
 479         for (p = u; p; p = UNIT_GET_SLICE(p)) {
 480                 CGroupContext *cc;
 481
 482                 cc = unit_get_cgroup_context(p);
 483                 if (!cc)
 484                         continue;
 485
 486                 r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
 487                                                   ipv4_map_fd, ipv6_map_fd, verdict);
 488                 if (r < 0)
 489                         return r;
 490         }
 491
 492         *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
 493         *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
 494         *ret_has_any = false;
 495         return 0;
 496 }
 497
 498 static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
 499         int r;
 500
 501         assert(u);
 502         assert(fd_ingress);
 503         assert(fd_egress);
 504
 505         if (enabled) {
 506                 if (*fd_ingress < 0) {
 507                         char *name = strjoina("I_", u->id);
 508                         r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
 509                         if (r < 0)
 510                                 return r;
 511
 512                         *fd_ingress = r;
 513                 }
 514
 515                 if (*fd_egress < 0) {
 516                         char *name = strjoina("E_", u->id);
 517                         r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
 518                         if (r < 0)
 519                                 return r;
 520
 521                         *fd_egress = r;
 522                 }
 523
 524         } else {
 525                 *fd_ingress = safe_close(*fd_ingress);
 526                 *fd_egress = safe_close(*fd_egress);
 527
 528                 zero(u->ip_accounting_extra);
 529         }
 530
 531         return 0;
 532 }
 533
 534 int bpf_firewall_compile(Unit *u) {
 535         const char *ingress_name = NULL, *egress_name = NULL;
 536         bool ip_allow_any = false, ip_deny_any = false;
 537         CGroupContext *cc;
 538         int r, supported;
 539
 540         assert(u);
 541
 542         cc = unit_get_cgroup_context(u);
 543         if (!cc)
 544                 return -EINVAL;
 545
 546         supported = bpf_firewall_supported();
 547         if (supported < 0)
 548                 return supported;
 549         if (supported == BPF_FIREWALL_UNSUPPORTED)
 550                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
 551                                             "bpf-firewall: BPF firewalling not supported, proceeding without.");
 552         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
 553                 /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
 554                  * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
 555                  * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
 556                  * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
 557                  * all, either. */
 558                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
 559                                             "bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units.");
 560
 561         /* If BPF_F_ALLOW_MULTI flag is supported program name is also supported (both were added to v4.15
 562          * kernel). */
 563         if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
 564                 ingress_name = "sd_fw_ingress";
 565                 egress_name = "sd_fw_egress";
 566         }
 567
 568         /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
 569          * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
 570          * configuration, but we don't flush out the accounting unnecessarily */
 571
 572         u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
 573         u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
 574
 575         u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
 576         u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
 577
 578         u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
 579         u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
 580
 581         if (u->type != UNIT_SLICE) {
 582                 /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
 583                  * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
 584                  * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
 585                  * means that all configure IP access rules *will* take effect on processes, even though we never
 586                  * compile them for inner nodes. */
 587
 588                 r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
 589                 if (r < 0)
 590                         return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");
 591
 592                 r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
 593                 if (r < 0)
 594                         return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
 595         }
 596
 597         r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
 598         if (r < 0)
 599                 return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");
 600
 601         r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
 602         if (r < 0)
 603                 return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");
 604
 605         r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
 606         if (r < 0)
 607                 return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");
 608
 609         return 0;
 610 }
 611
 612 static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
 613         set_clear(*set);
 614
 615         STRV_FOREACH(bpf_fs_path, filter_paths) {
 616                 _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
 617                 int r;
 618
 619                 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
 620                 if (r < 0)
 621                         return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m");
 622
 623                 r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
 624                 if (r < 0)
 625                         return log_unit_error_errno(u, r, "bpf-firewall: Loading of ingress BPF program %s failed: %m", *bpf_fs_path);
 626
 627                 r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
 628                 if (r < 0)
 629                         return log_oom();
 630         }
 631
 632         return 0;
 633 }
 634
 635 int bpf_firewall_load_custom(Unit *u) {
 636         CGroupContext *cc;
 637         int r, supported;
 638
 639         assert(u);
 640
 641         cc = unit_get_cgroup_context(u);
 642         if (!cc)
 643                 return 0;
 644
 645         if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
 646                 return 0;
 647
 648         supported = bpf_firewall_supported();
 649         if (supported < 0)
 650                 return supported;
 651
 652         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
 653                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
 654                                             "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
 655
 656         r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
 657         if (r < 0)
 658                 return r;
 659         r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
 660         if (r < 0)
 661                 return r;
 662
 663         return 0;
 664 }
 665
 666 static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
 667         BPFProgram *prog;
 668         int r;
 669
 670         assert(u);
 671
 672         set_clear(*set_installed);
 673         r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
 674         if (r < 0)
 675                 return log_oom();
 676
 677         SET_FOREACH_MOVE(prog, *set_installed, *set) {
 678                 r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
 679                 if (r < 0)
 680                         return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom egress BPF program to cgroup %s failed: %m", path);
 681         }
 682         return 0;
 683 }
 684
 685 int bpf_firewall_install(Unit *u) {
 686         _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
 687         _cleanup_free_ char *path = NULL;
 688         CGroupContext *cc;
 689         int r, supported;
 690         uint32_t flags;
 691
 692         assert(u);
 693
 694         cc = unit_get_cgroup_context(u);
 695         if (!cc)
 696                 return -EINVAL;
 697         if (!u->cgroup_path)
 698                 return -EINVAL;
 699         if (!u->cgroup_realized)
 700                 return -EINVAL;
 701
 702         supported = bpf_firewall_supported();
 703         if (supported < 0)
 704                 return supported;
 705         if (supported == BPF_FIREWALL_UNSUPPORTED)
 706                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
 707                                             "bpf-firewall: BPF firewalling not supported, proceeding without.");
 708         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
 709                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
 710                                             "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units.");
 711         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
 712             (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
 713                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
 714                                             "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
 715
 716         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
 717         if (r < 0)
 718                 return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");
 719
 720         flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0;
 721
 722         if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) {
                /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only
                 * after attaching the new programs, so that there's no time window where neither program is
                 * attached. (There will be a period where both are attached, but that's OK, since this is a
                 * security feature where we rather want to lock down too much than too little.) */
 727                 ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed);
 728                 ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed);
 729         } else {
 730                 /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
 731                  * detach them) right before attaching the new program, to minimize the time window when we
 732                  * don't account for IP traffic. */
 733                 u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
 734                 u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
 735         }
 736
 737         if (u->ip_bpf_egress) {
 738                 r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
 739                 if (r < 0)
 740                         return log_unit_error_errno(u, r,
 741                                 "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);
 742
 743                 /* Remember that this BPF program is installed now. */
 744                 u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress);
 745         }
 746
 747         if (u->ip_bpf_ingress) {
 748                 r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
 749                 if (r < 0)
 750                         return log_unit_error_errno(u, r,
 751                                 "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);
 752
 753                 u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress);
 754         }
 755
 756         /* And now, definitely get rid of the old programs, and detach them */
 757         ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
 758         ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);
 759
 760         r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
 761         if (r < 0)
 762                 return r;
 763
 764         r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
 765         if (r < 0)
 766                 return r;
 767
 768         return 0;
 769 }
 770
 771 int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
 772         uint64_t key, packets;
 773         int r;
 774
 775         if (map_fd < 0)
 776                 return -EBADF;
 777
 778         if (ret_packets) {
 779                 key = MAP_KEY_PACKETS;
 780                 r = bpf_map_lookup_element(map_fd, &key, &packets);
 781                 if (r < 0)
 782                         return r;
 783         }
 784
 785         if (ret_bytes) {
 786                 key = MAP_KEY_BYTES;
 787                 r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
 788                 if (r < 0)
 789                         return r;
 790         }
 791
 792         if (ret_packets)
 793                 *ret_packets = packets;
 794
 795         return 0;
 796 }
 797
 798 int bpf_firewall_reset_accounting(int map_fd) {
 799         uint64_t key, value = 0;
 800         int r;
 801
 802         if (map_fd < 0)
 803                 return -EBADF;
 804
 805         key = MAP_KEY_PACKETS;
 806         r = bpf_map_update_element(map_fd, &key, &value);
 807         if (r < 0)
 808                 return r;
 809
 810         key = MAP_KEY_BYTES;
 811         return bpf_map_update_element(map_fd, &key, &value);
 812 }
 813
/* Value returned by the log_debug_errno() call of whichever probe made bpf_firewall_supported() conclude
 * that BPF firewalling is unsupported; stays 0 as long as no such probe has failed.
 * emit_bpf_firewall_warning() passes it on as the error of its log message. */
static int bpf_firewall_unsupported_reason = 0;
 815
 816 int bpf_firewall_supported(void) {
 817         const struct bpf_insn trivial[] = {
 818                 BPF_MOV64_IMM(BPF_REG_0, 1),
 819                 BPF_EXIT_INSN()
 820         };
 821
 822         _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
 823         static int supported = -1;
 824         union bpf_attr attr;
 825         int r;
 826
 827         /* Checks whether BPF firewalling is supported. For this, we check the following things:
 828          *
 829          * - whether the unified hierarchy is being used
 830          * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
 831          * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
 832          */
 833         if (supported >= 0)
 834                 return supported;
 835
 836         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
 837         if (r < 0)
 838                 return log_error_errno(r, "bpf-firewall: Can't determine whether the unified hierarchy is used: %m");
 839         if (r == 0) {
 840                 bpf_firewall_unsupported_reason =
 841                         log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
 842                                         "bpf-firewall: Not running with unified cgroup hierarchy, BPF firewalling is not supported.");
 843                 return supported = BPF_FIREWALL_UNSUPPORTED;
 844         }
 845
 846         /* prog_name is NULL since it is supported only starting from v4.15 kernel. */
 847         r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program);
 848         if (r < 0) {
 849                 bpf_firewall_unsupported_reason =
 850                         log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 851                 return supported = BPF_FIREWALL_UNSUPPORTED;
 852         }
 853
 854         r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
 855         if (r < 0) {
 856                 bpf_firewall_unsupported_reason =
 857                         log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 858                 return supported = BPF_FIREWALL_UNSUPPORTED;
 859         }
 860
 861         r = bpf_program_load_kernel(program, NULL, 0);
 862         if (r < 0) {
 863                 bpf_firewall_unsupported_reason =
 864                         log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 865                 return supported = BPF_FIREWALL_UNSUPPORTED;
 866         }
 867
 868         /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
 869          * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
 870          * program if we can't do a thing with it later?
 871          *
 872          * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
 873          * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
 874          * parameters are validated however, and that'll fail with EBADF then. */
 875
 876         // FIXME: Clang doesn't 0-pad with structured initialization, causing
 877         // the kernel to reject the bpf_attr as invalid. See:
 878         // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
 879         // Ideally it should behave like GCC, so that we can remove these workarounds.
 880         zero(attr);
 881         attr.attach_type = BPF_CGROUP_INET_EGRESS;
 882         attr.target_fd = -EBADF;
 883         attr.attach_bpf_fd = -EBADF;
 884
 885         if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
 886                 if (errno != EBADF) {
 887                         bpf_firewall_unsupported_reason =
 888                                 log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
 889                         return supported = BPF_FIREWALL_UNSUPPORTED;
 890                 }
 891
 892                 /* YAY! */
 893         } else {
 894                 bpf_firewall_unsupported_reason =
 895                         log_debug_errno(SYNTHETIC_ERRNO(EBADE),
 896                                         "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? "
 897                                         "Something is weird, assuming BPF firewalling is broken and hence not supported.");
 898                 return supported = BPF_FIREWALL_UNSUPPORTED;
 899         }
 900
 901         /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
 902          * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
 903          * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
 904          * get EINVAL if it's not supported, and EBADF as before if it is available.
 905          * Use probe result as the indicator that program name is also supported since they both were
 906          * added in kernel 4.15. */
 907
 908         zero(attr);
 909         attr.attach_type = BPF_CGROUP_INET_EGRESS;
 910         attr.target_fd = -EBADF;
 911         attr.attach_bpf_fd = -EBADF;
 912         attr.attach_flags = BPF_F_ALLOW_MULTI;
 913
 914         if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
 915                 if (errno == EBADF) {
 916                         log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
 917                         return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
 918                 }
 919
 920                 if (errno == EINVAL)
 921                         log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
 922                 else
 923                         log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
 924
 925                 return supported = BPF_FIREWALL_SUPPORTED;
 926         } else {
 927                 bpf_firewall_unsupported_reason =
 928                         log_debug_errno(SYNTHETIC_ERRNO(EBADE),
 929                                         "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? "
 930                                         "Something is weird, assuming BPF firewalling is broken and hence not supported.");
 931                 return supported = BPF_FIREWALL_UNSUPPORTED;
 932         }
 933 }
 934
 935 void emit_bpf_firewall_warning(Unit *u) {
 936         static bool warned = false;
 937
 938         assert(u);
 939         assert(u->manager);
 940
 941         if (warned || MANAGER_IS_TEST_RUN(u->manager))
 942                 return;
 943
 944         bool quiet = ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0;
 945
 946         log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
 947                             "unit configures an IP firewall, but %s.\n"
 948                             "(This warning is only shown for the first unit using IP firewalling.)",
 949                             getuid() != 0 ? "not running as root" :
 950                             "the local system does not support BPF/cgroup firewalling");
 951         warned = true;
 952 }
 953
 954 void bpf_firewall_close(Unit *u) {
 955         assert(u);
 956
 957         u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd);
 958         u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd);
 959
 960         u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
 961         u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
 962         u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
 963         u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
 964
 965         u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
 966         u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
 967         u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
 968         u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
 969
 970         u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress);
 971         u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress);
 972         u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed);
 973         u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed);
 974 }