2 * Part of Very Secure FTPd
7 * Code to lock down the accessible kernel API in a Linux seccomp filter
8 * sandbox. Works in Ubuntu 11.10 and newer.
11 #include "seccompsandbox.h"
13 #if defined(__linux__) && defined(__x86_64__)
22 #include <netinet/in.h>
23 #include <netinet/tcp.h>
25 #include <sys/fcntl.h>
27 #include <sys/prctl.h>
28 #include <sys/socket.h>
29 #include <sys/types.h>
31 #include <linux/filter.h>
33 #include <asm/unistd.h>
35 /* #define DEBUG_SIGSYS 1 */
37 #ifndef PR_SET_SECCOMP
38 #define PR_SET_SECCOMP 22
41 #ifndef PR_SET_NO_NEW_PRIVS
42 #define PR_SET_NO_NEW_PRIVS 38
46 #define __NR_openat 257
50 #define O_LARGEFILE 00100000
54 #define O_DIRECTORY 00200000
58 #define O_CLOEXEC 002000000
61 #define kMaxSyscalls 100
69 handle_sigsys(int sig
)
/* Flag bits, beyond the access mode, that the sandbox rules accept. Used as
 * the mask for fcntl(F_SETFL) in the prelogin setup (together with O_ACCMODE)
 * and as the starting value of the open()/openat() flag mask in the postlogin
 * setup.
 */
75 static const int kOpenFlags
=
76 O_CREAT
|O_EXCL
|O_APPEND
|O_NONBLOCK
|O_DIRECTORY
|O_CLOEXEC
|O_LARGEFILE
;
/* Rule table state, filled by the allow_* and reject_nr helpers and read by
 * the lockdown code.
 *   s_syscall_index      - number of rules recorded so far; also the next
 *                          free slot in the arrays below.
 *   s_N_arg_validations  - how many recorded rules carry N argument checks;
 *                          used when sizing the filter.
 */
78 static size_t s_syscall_index
;
79 static size_t s_1_arg_validations
;
80 static size_t s_2_arg_validations
;
81 static size_t s_3_arg_validations
;
/* Parallel arrays, one entry per rule, each with room for kMaxSyscalls rules.
 *   s_syscalls - syscall number of the rule.
 *   s_errnos   - 0 for rules added by the allow_* helpers, the supplied
 *                errcode for rules added by reject_nr.
 *   s_args_N   - 1-based argument position (1..6) of the Nth check; the mask
 *                helpers store 100 + position in s_args_1.
 *   s_vals_N   - value (or mask) paired with s_args_N.
 */
82 static int s_syscalls
[kMaxSyscalls
];
83 static int s_errnos
[kMaxSyscalls
];
84 static int s_args_1
[kMaxSyscalls
];
85 static int s_vals_1
[kMaxSyscalls
];
86 static int s_args_2
[kMaxSyscalls
];
87 static int s_vals_2
[kMaxSyscalls
];
88 static int s_args_3
[kMaxSyscalls
];
89 static int s_vals_3
[kMaxSyscalls
];
/* Body that records an unconditional rule: the errno slot is set to 0, the
 * syscall number nr is stored, and s_syscall_index advances by one. No
 * argument checks are recorded. bug() is called when the table already holds
 * kMaxSyscalls entries.
 * NOTE(review): the function header and the condition guarding the
 * negative-syscall bug() call are not visible in this listing; presumably
 * this is the body of allow_nr(int nr) - confirm against the full file.
 */
94 if (s_syscall_index
>= kMaxSyscalls
)
96 bug("out of syscall space");
100 bug("negative syscall");
102 s_errnos
[s_syscall_index
] = 0;
103 s_syscalls
[s_syscall_index
++] = nr
;
/* Records a rule for syscall nr that carries an errno value rather than 0:
 * errcode is stored in s_errnos and nr in s_syscalls at the current index,
 * which then advances. The lockdown code builds a SECCOMP_RET_ERRNO return
 * value from s_errnos.
 *   nr      - syscall number.
 *   errcode - errno value for the rule; tested against the range 0..255.
 * bug() is called when the table already holds kMaxSyscalls entries.
 * NOTE(review): the condition for the negative-syscall check and the action
 * taken for an out-of-range errcode are not visible in this listing.
 */
107 reject_nr(int nr
, int errcode
)
109 if (s_syscall_index
>= kMaxSyscalls
)
111 bug("out of syscall space");
115 bug("negative syscall");
117 if (errcode
< 0 || errcode
> 255)
121 s_errnos
[s_syscall_index
] = errcode
;
122 s_syscalls
[s_syscall_index
++] = nr
;
/* Records a rule for syscall nr with one argument check.
 *   nr  - syscall number.
 *   arg - 1-based argument position; must be within 1..6 or bug() is called.
 *   val - value stored for comparison against that argument.
 * Stores arg/val in s_args_1/s_vals_1, sets the errno slot to 0, advances
 * s_syscall_index and bumps s_1_arg_validations. bug() is also called when
 * the table already holds kMaxSyscalls entries.
 * NOTE(review): the condition for the negative-syscall check is not visible
 * in this listing.
 */
126 allow_nr_1_arg_match(int nr
, int arg
, int val
)
128 if (s_syscall_index
>= kMaxSyscalls
)
130 bug("out of syscall space");
134 bug("negative syscall");
136 if (arg
< 1 || arg
> 6)
138 bug("arg out of range");
140 s_args_1
[s_syscall_index
] = arg
;
141 s_vals_1
[s_syscall_index
] = val
;
142 s_errnos
[s_syscall_index
] = 0;
143 s_syscalls
[s_syscall_index
++] = nr
;
144 s_1_arg_validations
++;
/* Records a rule for syscall nr with one mask-style argument check.
 *   nr  - syscall number.
 *   arg - 1-based argument position; must be within 1..6 or bug() is called.
 *   val - mask value stored for that argument.
 * Identical bookkeeping to allow_nr_1_arg_match except that the position is
 * stored as 100 + arg. bug() is also called when the table already holds
 * kMaxSyscalls entries.
 * NOTE(review): the condition for the negative-syscall check is not visible
 * in this listing.
 */
148 allow_nr_1_arg_mask(int nr
, int arg
, int val
)
150 if (s_syscall_index
>= kMaxSyscalls
)
152 bug("out of syscall space");
156 bug("negative syscall");
158 if (arg
< 1 || arg
> 6)
160 bug("arg out of range");
/* NOTE(review): the 100 offset presumably tags this entry as a mask rule for
 * the lockdown code (which has a BPF_JSET variant) - confirm there.
 */
162 s_args_1
[s_syscall_index
] = 100 + arg
;
163 s_vals_1
[s_syscall_index
] = val
;
164 s_errnos
[s_syscall_index
] = 0;
165 s_syscalls
[s_syscall_index
++] = nr
;
166 s_1_arg_validations
++;
/* Records a rule for syscall nr with two argument checks.
 *   nr         - syscall number.
 *   arg1, arg2 - 1-based argument positions; each must be within 1..6 or
 *                bug() is called.
 *   val1, val2 - values stored for comparison against those arguments.
 * Stores the pairs in s_args_1/s_vals_1 and s_args_2/s_vals_2, sets the errno
 * slot to 0, advances s_syscall_index and bumps s_2_arg_validations. bug() is
 * also called when the table already holds kMaxSyscalls entries.
 * NOTE(review): the condition for the negative-syscall check is not visible
 * in this listing.
 */
170 allow_nr_2_arg_match(int nr
, int arg1
, int val1
, int arg2
, int val2
)
172 if (s_syscall_index
>= kMaxSyscalls
)
174 bug("out of syscall space");
178 bug("negative syscall");
180 if (arg1
< 1 || arg1
> 6)
182 bug("arg1 out of range");
184 if (arg2
< 1 || arg2
> 6)
186 bug("arg2 out of range");
188 s_args_1
[s_syscall_index
] = arg1
;
189 s_vals_1
[s_syscall_index
] = val1
;
190 s_args_2
[s_syscall_index
] = arg2
;
191 s_vals_2
[s_syscall_index
] = val2
;
192 s_errnos
[s_syscall_index
] = 0;
193 s_syscalls
[s_syscall_index
++] = nr
;
194 s_2_arg_validations
++;
/* Records a rule for syscall nr with two argument checks, the first of them
 * mask-style.
 *   nr   - syscall number.
 *   arg1 - 1-based position of the masked argument (1..6); stored as
 *          100 + arg1 in s_args_1.
 *   val1 - mask value stored for arg1.
 *   arg2 - 1-based position of the second argument (1..6); stored unchanged.
 *   val2 - value stored for comparison against arg2.
 * Sets the errno slot to 0, advances s_syscall_index and bumps
 * s_2_arg_validations. bug() is called for an out-of-range position or when
 * the table already holds kMaxSyscalls entries.
 * NOTE(review): the condition for the negative-syscall check is not visible
 * in this listing.
 */
198 allow_nr_2_arg_mask_match(int nr
, int arg1
, int val1
, int arg2
, int val2
)
200 if (s_syscall_index
>= kMaxSyscalls
)
202 bug("out of syscall space");
206 bug("negative syscall");
208 if (arg1
< 1 || arg1
> 6)
210 bug("arg1 out of range");
212 if (arg2
< 1 || arg2
> 6)
214 bug("arg2 out of range");
216 s_args_1
[s_syscall_index
] = 100 + arg1
;
217 s_vals_1
[s_syscall_index
] = val1
;
218 s_args_2
[s_syscall_index
] = arg2
;
219 s_vals_2
[s_syscall_index
] = val2
;
220 s_errnos
[s_syscall_index
] = 0;
221 s_syscalls
[s_syscall_index
++] = nr
;
222 s_2_arg_validations
++;
/* Records a rule for syscall nr with three argument checks.
 *   nr               - syscall number.
 *   arg1, arg2, arg3 - 1-based argument positions; each must be within 1..6
 *                      or bug() is called.
 *   val1, val2, val3 - values stored for comparison against those arguments.
 * Stores the pairs in s_args_N/s_vals_N, sets the errno slot to 0, advances
 * s_syscall_index and bumps s_3_arg_validations. bug() is also called when
 * the table already holds kMaxSyscalls entries.
 * NOTE(review): the tail of the parameter list (where val3 is declared) and
 * the condition for the negative-syscall check are not visible in this
 * listing.
 */
226 allow_nr_3_arg_match(int nr
, int arg1
, int val1
, int arg2
, int val2
, int arg3
,
229 if (s_syscall_index
>= kMaxSyscalls
)
231 bug("out of syscall space");
235 bug("negative syscall");
237 if (arg1
< 1 || arg1
> 6)
239 bug("arg1 out of range");
241 if (arg2
< 1 || arg2
> 6)
243 bug("arg2 out of range");
245 if (arg3
< 1 || arg3
> 6)
247 bug("arg3 out of range");
249 s_args_1
[s_syscall_index
] = arg1
;
250 s_vals_1
[s_syscall_index
] = val1
;
251 s_args_2
[s_syscall_index
] = arg2
;
252 s_vals_2
[s_syscall_index
] = val2
;
253 s_args_3
[s_syscall_index
] = arg3
;
254 s_vals_3
[s_syscall_index
] = val3
;
255 s_errnos
[s_syscall_index
] = 0;
256 s_syscalls
[s_syscall_index
++] = nr
;
257 s_3_arg_validations
++;
/* Adds the rules needed to create data connections.
 * Visible rules: socket() restricted to PF_INET / SOCK_STREAM / IPPROTO_TCP,
 * a second socket() rule whose arguments are cut off in this listing, and
 * select(). Following the tunable_port_enable test: connect(), getsockopt()
 * for SOL_SOCKET/SO_ERROR, setsockopt() for SOL_SOCKET/SO_REUSEADDR, and
 * fcntl() limited to F_GETFL and to F_SETFL with O_RDWR or
 * O_RDWR|O_NONBLOCK. Following the tunable_pasv_enable test: listen() and
 * accept().
 * NOTE(review): braces are not visible, so exactly which calls each
 * condition guards must be confirmed against the full file.
 */
261 seccomp_sandbox_setup_data_connections()
263 allow_nr_3_arg_match(__NR_socket
, 1, PF_INET
, 2, SOCK_STREAM
, 3, IPPROTO_TCP
);
264 allow_nr_3_arg_match(__NR_socket
,
269 allow_nr(__NR_select
);
270 if (tunable_port_enable
)
272 allow_nr(__NR_connect
);
273 allow_nr_2_arg_match(__NR_getsockopt
, 2, SOL_SOCKET
, 3, SO_ERROR
);
274 allow_nr_2_arg_match(__NR_setsockopt
, 2, SOL_SOCKET
, 3, SO_REUSEADDR
);
275 allow_nr_1_arg_match(__NR_fcntl
, 2, F_GETFL
);
276 allow_nr_2_arg_match(__NR_fcntl
, 2, F_SETFL
, 3, O_RDWR
|O_NONBLOCK
);
277 allow_nr_2_arg_match(__NR_fcntl
, 2, F_SETFL
, 3, O_RDWR
);
279 if (tunable_pasv_enable
)
281 allow_nr(__NR_listen
);
282 allow_nr(__NR_accept
);
/* Adds the baseline rules shared by the other setup routines in this file
 * (the prelogin setup and the postlogin broker setup both call it):
 * write(), anonymous private read/write mmap(), mprotect() with a PROT_READ
 * mask on argument 3, munmap(), an ENOSYS rejection for mremap(),
 * gettimeofday(), rt_sigreturn(), restart_syscall(), close() and
 * exit_group().
 * NOTE(review): some lines of this function are not visible in this listing,
 * so the list above may be incomplete.
 */
287 seccomp_sandbox_setup_base()
289 /* Simple reads and writes on existing descriptors. */
291 allow_nr(__NR_write
);
293 /* Needed for memory management. */
294 allow_nr_2_arg_match(__NR_mmap
,
295 3, PROT_READ
|PROT_WRITE
,
296 4, MAP_PRIVATE
|MAP_ANON
);
297 allow_nr_1_arg_mask(__NR_mprotect
, 3, PROT_READ
);
298 allow_nr(__NR_munmap
);
300 /* glibc falls back gracefully if mremap() fails during realloc(). */
301 reject_nr(__NR_mremap
, ENOSYS
);
303 /* Misc simple low-risk calls. */
304 allow_nr(__NR_gettimeofday
); /* Used by logging. */
305 allow_nr(__NR_rt_sigreturn
); /* Used to handle SIGPIPE. */
306 allow_nr(__NR_restart_syscall
);
307 allow_nr(__NR_close
);
309 /* Always need to be able to exit ! */
310 allow_nr(__NR_exit_group
);
/* Entry point run before any rules are added: treats a non-zero
 * s_syscall_index (rules already recorded) as a programming error and
 * reports it through bug().
 * NOTE(review): any further initialisation is not visible in this listing.
 */
314 seccomp_sandbox_init()
316 if (s_syscall_index
!= 0)
318 bug("bad state in seccomp_sandbox_init");
/* Adds the rule set for the pre-login phase: the base rules, recvfrom()
 * limited to MSG_PEEK, nanosleep(), getpid(), shutdown(), and fcntl()
 * limited to F_GETFL and to F_SETFL with a kOpenFlags|O_ACCMODE mask on
 * argument 3. Further rules depend on tunables: rt_sigaction() and alarm()
 * for the idle session timeout, fcntl() file locking for the transfer logs,
 * recvmsg() and setsockopt(IPPROTO_TCP, TCP_NODELAY) for SSL, and an EACCES
 * rejection of socket() for syslog.
 *   p_sess - session pointer; not read by the lines visible in this listing.
 * NOTE(review): braces are not visible, so exactly which calls each
 * condition guards must be confirmed against the full file.
 */
323 seccomp_sandbox_setup_prelogin(const struct vsf_session
* p_sess
)
327 seccomp_sandbox_setup_base();
329 /* Peeking FTP commands from the network. */
330 allow_nr_1_arg_match(__NR_recvfrom
, 4, MSG_PEEK
);
332 /* Misc simple low-risk calls */
333 allow_nr(__NR_nanosleep
); /* Used for bandwidth / login throttling. */
334 allow_nr(__NR_getpid
); /* Used by logging. */
335 allow_nr(__NR_shutdown
); /* Used for QUIT or a timeout. */
336 allow_nr_1_arg_match(__NR_fcntl
, 2, F_GETFL
);
337 /* It's safe to allow O_RDWR in fcntl because these flags cannot be changed.
338 * Also, sockets are O_RDWR.
340 allow_nr_2_arg_mask_match(__NR_fcntl
, 3, kOpenFlags
|O_ACCMODE
, 2, F_SETFL
);
342 /* Config-dependent items follow. */
343 if (tunable_idle_session_timeout
> 0)
345 allow_nr(__NR_rt_sigaction
);
346 allow_nr(__NR_alarm
);
348 if (tunable_xferlog_enable
|| tunable_dual_log_enable
)
350 /* For file locking. */
351 allow_nr_1_arg_match(__NR_fcntl
, 2, F_SETLKW
);
352 allow_nr_1_arg_match(__NR_fcntl
, 2, F_SETLK
);
354 if (tunable_ssl_enable
)
356 allow_nr_1_arg_match(__NR_recvmsg
, 3, 0);
357 allow_nr_2_arg_match(__NR_setsockopt
, 2, IPPROTO_TCP
, 3, TCP_NODELAY
);
359 if (tunable_syslog_enable
)
361 reject_nr(__NR_socket
, EACCES
);
/* Adds the rule set for the post-login phase. lstat() is recorded first,
 * then everything from the prelogin setup, then file, directory and socket
 * option syscalls, followed by rules selected by tunables (sendfile, signal
 * and alarm handling, data connections or descriptor passing, syslog, text
 * user database lookups, and the write-related calls).
 *   p_sess - session pointer; is_anonymous is read here and the pointer is
 *            passed on to the prelogin setup.
 * open_flag starts as kOpenFlags and gains O_ACCMODE when
 * tunable_write_enable is set; it is the mask used for open() and openat().
 * NOTE(review): braces and several lines are not visible in this listing, so
 * exactly which calls each condition guards must be confirmed against the
 * full file.
 */
366 seccomp_sandbox_setup_postlogin(const struct vsf_session
* p_sess
)
368 int is_anon
= p_sess
->is_anonymous
;
369 int open_flag
= kOpenFlags
;
370 if (tunable_write_enable
)
372 open_flag
|= O_ACCMODE
;
375 /* Put lstat() first because it is a very hot syscall for large directory
376 * listings. And the current BPF only allows a linear scan of allowed
379 allow_nr(__NR_lstat
);
381 /* Allow all the simple pre-login things and then expand upon them. */
382 seccomp_sandbox_setup_prelogin(p_sess
);
384 /* Simple file descriptor-based operations. */
385 if (tunable_xferlog_enable
|| tunable_dual_log_enable
||
386 tunable_lock_upload_files
)
388 allow_nr_1_arg_match(__NR_fcntl
, 2, F_SETLKW
);
389 allow_nr_1_arg_match(__NR_fcntl
, 2, F_SETLK
);
391 if (tunable_async_abor_enable
)
393 allow_nr_2_arg_match(__NR_fcntl
, 2, F_SETOWN
, 3, vsf_sysutil_getpid());
395 allow_nr_2_arg_match(__NR_setsockopt
, 2, SOL_SOCKET
, 3, SO_KEEPALIVE
);
396 allow_nr_2_arg_match(__NR_setsockopt
, 2, SOL_SOCKET
, 3, SO_LINGER
);
397 allow_nr_2_arg_match(__NR_setsockopt
, 2, IPPROTO_IP
, 3, IP_TOS
);
398 allow_nr(__NR_fstat
);
399 allow_nr(__NR_lseek
);
400 /* Since we use chroot() to restrict filesystem access, we can just blanket
403 allow_nr_1_arg_mask(__NR_open
, 2, open_flag
);
404 allow_nr_1_arg_mask(__NR_openat
, 3, open_flag
);
405 /* Other pathname-based metadata queries. */
407 allow_nr(__NR_readlink
);
408 /* Directory handling: query, change, read. */
409 allow_nr(__NR_getcwd
);
410 allow_nr(__NR_chdir
);
411 allow_nr(__NR_getdents
);
413 allow_nr(__NR_umask
);
415 /* Config-dependent items follow. */
416 if (tunable_use_sendfile
)
418 allow_nr(__NR_sendfile
);
420 if (tunable_idle_session_timeout
> 0 ||
421 tunable_data_connection_timeout
> 0 ||
422 tunable_async_abor_enable
)
424 allow_nr(__NR_rt_sigaction
);
426 if (tunable_idle_session_timeout
> 0 || tunable_data_connection_timeout
> 0)
428 allow_nr(__NR_alarm
);
431 if (tunable_one_process_model
)
433 seccomp_sandbox_setup_data_connections();
434 if (is_anon
&& tunable_chown_uploads
)
436 allow_nr(__NR_fchmod
);
437 allow_nr(__NR_fchown
);
442 /* Need to receive file descriptors from privileged broker. */
443 allow_nr_1_arg_match(__NR_recvmsg
, 3, 0);
444 if ((is_anon
&& tunable_chown_uploads
) || tunable_ssl_enable
)
446 /* Need to send file descriptors to privileged broker. */
447 allow_nr_1_arg_match(__NR_sendmsg
, 3, 0);
451 if (tunable_syslog_enable
)
453 /* The ability to pass an address spec isn't needed so disable it. We ensure
454 * the 6th arg (socklen) is 0. We could have checked the 5th arg (sockptr)
455 * but I don't know if 64-bit compares work in the kernel filter, so we're
456 * happy to check the socklen arg, which is 32 bits.
458 allow_nr_1_arg_match(__NR_sendto
, 6, 0);
461 if (tunable_text_userdb_names
)
463 reject_nr(__NR_socket
, EACCES
);
464 allow_nr_2_arg_match(__NR_mmap
, 3, PROT_READ
, 4, MAP_SHARED
);
467 if (tunable_write_enable
)
469 if (!is_anon
|| tunable_anon_mkdir_write_enable
)
471 allow_nr(__NR_mkdir
);
474 tunable_anon_other_write_enable
||
475 tunable_delete_failed_uploads
)
477 allow_nr(__NR_unlink
);
479 if (!is_anon
|| tunable_anon_other_write_enable
)
481 allow_nr(__NR_rmdir
);
482 allow_nr(__NR_rename
);
483 allow_nr(__NR_ftruncate
);
484 if (tunable_mdtm_write
)
486 allow_nr(__NR_utime
);
487 allow_nr(__NR_utimes
);
490 if (!is_anon
&& tunable_chmod_enable
)
492 allow_nr(__NR_chmod
);
/* Adds the rule set for the post-login broker: the base rules, the data
 * connection rules, and sendmsg() with argument 3 required to be 0.
 */
498 seccomp_sandbox_setup_postlogin_broker()
500 seccomp_sandbox_setup_base();
501 seccomp_sandbox_setup_data_connections();
502 allow_nr_1_arg_match(__NR_sendmsg
, 3, 0);
/* Turns the recorded rule table into a BPF program and installs it.
 * Visible steps: size and fill a sock_filter array (architecture check,
 * syscall number load, then one group of instructions per recorded rule,
 * then a final return), call prctl(PR_SET_NO_NEW_PRIVS), optionally install
 * handle_sigsys for SIGSYS, and call prctl(PR_SET_SECCOMP) with the program.
 * die() is called with a message naming the failing prctl.
 * NOTE(review): many lines of this function are not visible in this listing
 * (pointer advances, jump targets, several conditions and declarations), so
 * the inline notes below describe only what the visible lines show.
 */
506 seccomp_sandbox_lockdown()
/* Instruction budget: 2 per rule plus 3, 5 or 7 more for rules carrying one,
 * two or three argument checks. The final term of the sum is not visible in
 * this listing.
 */
508 size_t len
= (s_syscall_index
* 2) +
509 (s_1_arg_validations
* 3) +
510 (s_2_arg_validations
* 5) +
511 (s_3_arg_validations
* 7) +
/* Variable-length array sized by len; p_filter is the write cursor into it. */
513 struct sock_filter filters
[len
];
514 struct sock_filter
* p_filter
= filters
;
515 struct sock_fprog prog
;
520 prog
.filter
= filters
;
521 /* Validate the syscall architecture. */
522 p_filter
->code
= BPF_LD
+BPF_W
+BPF_ABS
;
525 /* Offset 4 for syscall architecture. */
528 p_filter
->code
= BPF_JMP
+BPF_JEQ
+BPF_K
;
531 /* AUDIT_ARCH_X86_64 */
532 p_filter
->k
= 0xc000003e;
534 p_filter
->code
= BPF_RET
+BPF_K
;
537 /* SECCOMP_RET_KILL */
541 /* Load the syscall number. */
542 p_filter
->code
= BPF_LD
+BPF_W
+BPF_ABS
;
545 /* Offset 0 for syscall number. */
/* One pass per recorded rule, in the order the rules were added. */
549 for (i
= 0; i
< s_syscall_index
; ++i
)
556 else if (s_args_2
[i
])
560 else if (s_args_1
[i
])
564 /* Check for syscall number match. */
565 p_filter
->code
= BPF_JMP
+BPF_JEQ
+BPF_K
;
567 p_filter
->jf
= block_size
;
568 p_filter
->k
= s_syscalls
[i
];
570 /* Check argument matches if necessary. */
/* Argument N is loaded as a 32-bit word from offset 16 + 8 * (N - 1).
 * NOTE(review): presumably the low half of a 64-bit argument slot - confirm
 * against the kernel definition of struct seccomp_data.
 */
573 p_filter
->code
= BPF_LD
+BPF_W
+BPF_ABS
;
576 p_filter
->k
= 16 + ((s_args_3
[i
] - 1) * 8);
578 p_filter
->code
= BPF_JMP
+BPF_JEQ
+BPF_K
;
581 p_filter
->k
= s_vals_3
[i
];
586 p_filter
->code
= BPF_LD
+BPF_W
+BPF_ABS
;
589 p_filter
->k
= 16 + ((s_args_2
[i
] - 1) * 8);
591 p_filter
->code
= BPF_JMP
+BPF_JEQ
+BPF_K
;
594 p_filter
->k
= s_vals_2
[i
];
/* First argument check: equality jump by default, switched to a BPF_JSET
 * (bit test) jump below.
 * NOTE(review): the condition selecting the BPF_JSET form is not visible;
 * presumably it keys off the 100 offset stored by the mask helpers.
 */
599 int arg
= s_args_1
[i
];
600 int code
= BPF_JMP
+BPF_JEQ
+BPF_K
;
601 int val
= s_vals_1
[i
];
607 code
= BPF_JMP
+BPF_JSET
+BPF_K
;
612 p_filter
->code
= BPF_LD
+BPF_W
+BPF_ABS
;
615 p_filter
->k
= 16 + ((arg
- 1) * 8);
617 p_filter
->code
= code
;
623 p_filter
->code
= BPF_RET
+BPF_K
;
628 /* SECCOMP_RET_ALLOW */
629 p_filter
->k
= 0x7fff0000;
633 /* SECCOMP_RET_ERRNO */
634 p_filter
->k
= 0x00050000 + s_errnos
[i
];
639 /* We trashed the accumulator so put it back. */
640 p_filter
->code
= BPF_LD
+BPF_W
+BPF_ABS
;
647 /* No "allow" matches so kill. */
648 p_filter
->code
= BPF_RET
+BPF_K
;
652 /* SECCOMP_RET_TRAP */
653 p_filter
->k
= 0x00030000;
655 /* SECCOMP_RET_KILL */
/* Set the no-new-privileges flag before installing the filter. */
659 ret
= prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0);
664 /* Kernel isn't good enough. */
667 die("prctl PR_SET_NO_NEW_PRIVS");
670 if (!tunable_seccomp_sandbox
)
678 memset(&sa
, '\0', sizeof(sa
));
679 sa
.sa_handler
= handle_sigsys
;
680 sigaction(SIGSYS
, &sa
, NULL
);
/* Install the program; the second argument 2 selects the seccomp mode.
 * NOTE(review): presumably SECCOMP_MODE_FILTER - confirm against the kernel
 * headers.
 */
684 ret
= prctl(PR_SET_SECCOMP
, 2, &prog
, 0, 0);
689 /* Kernel isn't good enough. */
692 die("prctl PR_SET_SECCOMP failed");
696 #else /* __linux__ && __x86_64__ */
/* Alternative definitions of the five public entry points, compiled when the
 * __linux__ && __x86_64__ condition of the enclosing preprocessor block is
 * false.
 * NOTE(review): the bodies are not visible in this listing; presumably they
 * are empty so the sandbox is a no-op on other platforms - confirm against
 * the full file.
 */
699 seccomp_sandbox_init()
704 seccomp_sandbox_setup_prelogin(const struct vsf_session
* p_sess
)
710 seccomp_sandbox_setup_postlogin(const struct vsf_session
* p_sess
)
716 seccomp_sandbox_setup_postlogin_broker()
721 seccomp_sandbox_lockdown()
725 #endif /* __linux__ && __x86_64__ */