vsftpd 3.0.3
[tomato.git] / release / src-rt-6.x.4708 / router / vsftpd / seccompsandbox.c
blob2c350a9a9914d2e780e609f45249b61f9cbdc3fc
1 /*
2 * Part of Very Secure FTPd
3 * Licence: GPL v2
4 * Author: Chris Evans
5 * seccompsandbox.c
7 * Code to lock down the accessible kernel API in a Linux seccomp filter
8 * sandbox. Works in Ubuntu 11.10 and newer.
9 */
11 #include "seccompsandbox.h"
13 #if defined(__linux__) && defined(__x86_64__)
15 #include "session.h"
16 #include "sysutil.h"
17 #include "tunables.h"
18 #include "utility.h"
20 #include <errno.h>
22 #include <netinet/in.h>
23 #include <netinet/tcp.h>
25 #include <sys/fcntl.h>
26 #include <sys/mman.h>
27 #include <sys/prctl.h>
28 #include <sys/socket.h>
29 #include <sys/types.h>
31 #include <linux/filter.h>
33 #include <asm/unistd.h>
35 /* #define DEBUG_SIGSYS 1 */
37 #ifndef PR_SET_SECCOMP
38 #define PR_SET_SECCOMP 22
39 #endif
41 #ifndef PR_SET_NO_NEW_PRIVS
42 #define PR_SET_NO_NEW_PRIVS 38
43 #endif
45 #ifndef __NR_openat
46 #define __NR_openat 257
47 #endif
49 #ifndef O_LARGEFILE
50 #define O_LARGEFILE 00100000
51 #endif
53 #ifndef O_DIRECTORY
54 #define O_DIRECTORY 00200000
55 #endif
57 #ifndef O_CLOEXEC
58 #define O_CLOEXEC 002000000
59 #endif
61 #define kMaxSyscalls 100
63 #ifdef DEBUG_SIGSYS
65 #include <signal.h>
66 #include <string.h>
/* Debug-only SIGSYS handler (built when DEBUG_SIGSYS is defined). It does
 * nothing itself; seccomp_sandbox_lockdown() installs it for SIGSYS.
 */
void
handle_sigsys(int sig)
{
  /* The signal number is not needed. */
  (void) sig;
}
73 #endif
75 static const int kOpenFlags =
76 O_CREAT|O_EXCL|O_APPEND|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_LARGEFILE;
78 static size_t s_syscall_index;
79 static size_t s_1_arg_validations;
80 static size_t s_2_arg_validations;
81 static size_t s_3_arg_validations;
82 static int s_syscalls[kMaxSyscalls];
83 static int s_errnos[kMaxSyscalls];
84 static int s_args_1[kMaxSyscalls];
85 static int s_vals_1[kMaxSyscalls];
86 static int s_args_2[kMaxSyscalls];
87 static int s_vals_2[kMaxSyscalls];
88 static int s_args_3[kMaxSyscalls];
89 static int s_vals_3[kMaxSyscalls];
91 static void
92 allow_nr(int nr)
94 if (s_syscall_index >= kMaxSyscalls)
96 bug("out of syscall space");
98 if (nr < 0)
100 bug("negative syscall");
102 s_errnos[s_syscall_index] = 0;
103 s_syscalls[s_syscall_index++] = nr;
106 static void
107 reject_nr(int nr, int errcode)
109 if (s_syscall_index >= kMaxSyscalls)
111 bug("out of syscall space");
113 if (nr < 0)
115 bug("negative syscall");
117 if (errcode < 0 || errcode > 255)
119 bug("bad errcode");
121 s_errnos[s_syscall_index] = errcode;
122 s_syscalls[s_syscall_index++] = nr;
125 static void
126 allow_nr_1_arg_match(int nr, int arg, int val)
128 if (s_syscall_index >= kMaxSyscalls)
130 bug("out of syscall space");
132 if (nr < 0)
134 bug("negative syscall");
136 if (arg < 1 || arg > 6)
138 bug("arg out of range");
140 s_args_1[s_syscall_index] = arg;
141 s_vals_1[s_syscall_index] = val;
142 s_errnos[s_syscall_index] = 0;
143 s_syscalls[s_syscall_index++] = nr;
144 s_1_arg_validations++;
147 static void
148 allow_nr_1_arg_mask(int nr, int arg, int val)
150 if (s_syscall_index >= kMaxSyscalls)
152 bug("out of syscall space");
154 if (nr < 0)
156 bug("negative syscall");
158 if (arg < 1 || arg > 6)
160 bug("arg out of range");
162 s_args_1[s_syscall_index] = 100 + arg;
163 s_vals_1[s_syscall_index] = val;
164 s_errnos[s_syscall_index] = 0;
165 s_syscalls[s_syscall_index++] = nr;
166 s_1_arg_validations++;
169 static void
170 allow_nr_2_arg_match(int nr, int arg1, int val1, int arg2, int val2)
172 if (s_syscall_index >= kMaxSyscalls)
174 bug("out of syscall space");
176 if (nr < 0)
178 bug("negative syscall");
180 if (arg1 < 1 || arg1 > 6)
182 bug("arg1 out of range");
184 if (arg2 < 1 || arg2 > 6)
186 bug("arg2 out of range");
188 s_args_1[s_syscall_index] = arg1;
189 s_vals_1[s_syscall_index] = val1;
190 s_args_2[s_syscall_index] = arg2;
191 s_vals_2[s_syscall_index] = val2;
192 s_errnos[s_syscall_index] = 0;
193 s_syscalls[s_syscall_index++] = nr;
194 s_2_arg_validations++;
197 static void
198 allow_nr_2_arg_mask_match(int nr, int arg1, int val1, int arg2, int val2)
200 if (s_syscall_index >= kMaxSyscalls)
202 bug("out of syscall space");
204 if (nr < 0)
206 bug("negative syscall");
208 if (arg1 < 1 || arg1 > 6)
210 bug("arg1 out of range");
212 if (arg2 < 1 || arg2 > 6)
214 bug("arg2 out of range");
216 s_args_1[s_syscall_index] = 100 + arg1;
217 s_vals_1[s_syscall_index] = val1;
218 s_args_2[s_syscall_index] = arg2;
219 s_vals_2[s_syscall_index] = val2;
220 s_errnos[s_syscall_index] = 0;
221 s_syscalls[s_syscall_index++] = nr;
222 s_2_arg_validations++;
225 static void
226 allow_nr_3_arg_match(int nr, int arg1, int val1, int arg2, int val2, int arg3,
227 int val3)
229 if (s_syscall_index >= kMaxSyscalls)
231 bug("out of syscall space");
233 if (nr < 0)
235 bug("negative syscall");
237 if (arg1 < 1 || arg1 > 6)
239 bug("arg1 out of range");
241 if (arg2 < 1 || arg2 > 6)
243 bug("arg2 out of range");
245 if (arg3 < 1 || arg3 > 6)
247 bug("arg3 out of range");
249 s_args_1[s_syscall_index] = arg1;
250 s_vals_1[s_syscall_index] = val1;
251 s_args_2[s_syscall_index] = arg2;
252 s_vals_2[s_syscall_index] = val2;
253 s_args_3[s_syscall_index] = arg3;
254 s_vals_3[s_syscall_index] = val3;
255 s_errnos[s_syscall_index] = 0;
256 s_syscalls[s_syscall_index++] = nr;
257 s_3_arg_validations++;
260 static void
261 seccomp_sandbox_setup_data_connections()
263 allow_nr_3_arg_match(__NR_socket, 1, PF_INET, 2, SOCK_STREAM, 3, IPPROTO_TCP);
264 allow_nr_3_arg_match(__NR_socket,
265 1, PF_INET6,
266 2, SOCK_STREAM,
267 3, IPPROTO_TCP);
268 allow_nr(__NR_bind);
269 allow_nr(__NR_select);
270 if (tunable_port_enable)
272 allow_nr(__NR_connect);
273 allow_nr_2_arg_match(__NR_getsockopt, 2, SOL_SOCKET, 3, SO_ERROR);
274 allow_nr_2_arg_match(__NR_setsockopt, 2, SOL_SOCKET, 3, SO_REUSEADDR);
275 allow_nr_1_arg_match(__NR_fcntl, 2, F_GETFL);
276 allow_nr_2_arg_match(__NR_fcntl, 2, F_SETFL, 3, O_RDWR|O_NONBLOCK);
277 allow_nr_2_arg_match(__NR_fcntl, 2, F_SETFL, 3, O_RDWR);
279 if (tunable_pasv_enable)
281 allow_nr(__NR_listen);
282 allow_nr(__NR_accept);
286 static void
287 seccomp_sandbox_setup_base()
289 /* Simple reads and writes on existing descriptors. */
290 allow_nr(__NR_read);
291 allow_nr(__NR_write);
293 /* Needed for memory management. */
294 allow_nr_2_arg_match(__NR_mmap,
295 3, PROT_READ|PROT_WRITE,
296 4, MAP_PRIVATE|MAP_ANON);
297 allow_nr_1_arg_mask(__NR_mprotect, 3, PROT_READ);
298 allow_nr(__NR_munmap);
299 allow_nr(__NR_brk);
300 /* glibc falls back gracefully if mremap() fails during realloc(). */
301 reject_nr(__NR_mremap, ENOSYS);
303 /* Misc simple low-risk calls. */
304 allow_nr(__NR_gettimeofday); /* Used by logging. */
305 allow_nr(__NR_rt_sigreturn); /* Used to handle SIGPIPE. */
306 allow_nr(__NR_restart_syscall);
307 allow_nr(__NR_close);
309 /* Always need to be able to exit ! */
310 allow_nr(__NR_exit_group);
313 void
314 seccomp_sandbox_init()
316 if (s_syscall_index != 0)
318 bug("bad state in seccomp_sandbox_init");
322 void
323 seccomp_sandbox_setup_prelogin(const struct vsf_session* p_sess)
325 (void) p_sess;
327 seccomp_sandbox_setup_base();
329 /* Peeking FTP commands from the network. */
330 allow_nr_1_arg_match(__NR_recvfrom, 4, MSG_PEEK);
332 /* Misc simple low-risk calls */
333 allow_nr(__NR_nanosleep); /* Used for bandwidth / login throttling. */
334 allow_nr(__NR_getpid); /* Used by logging. */
335 allow_nr(__NR_shutdown); /* Used for QUIT or a timeout. */
336 allow_nr_1_arg_match(__NR_fcntl, 2, F_GETFL);
337 /* It's safe to allow O_RDWR in fcntl because these flags cannot be changed.
338 * Also, sockets are O_RDWR.
340 allow_nr_2_arg_mask_match(__NR_fcntl, 3, kOpenFlags|O_ACCMODE, 2, F_SETFL);
342 /* Config-dependent items follow. */
343 if (tunable_idle_session_timeout > 0)
345 allow_nr(__NR_rt_sigaction);
346 allow_nr(__NR_alarm);
348 if (tunable_xferlog_enable || tunable_dual_log_enable)
350 /* For file locking. */
351 allow_nr_1_arg_match(__NR_fcntl, 2, F_SETLKW);
352 allow_nr_1_arg_match(__NR_fcntl, 2, F_SETLK);
354 if (tunable_ssl_enable)
356 allow_nr_1_arg_match(__NR_recvmsg, 3, 0);
357 allow_nr_2_arg_match(__NR_setsockopt, 2, IPPROTO_TCP, 3, TCP_NODELAY);
359 if (tunable_syslog_enable)
361 reject_nr(__NR_socket, EACCES);
365 void
366 seccomp_sandbox_setup_postlogin(const struct vsf_session* p_sess)
368 int is_anon = p_sess->is_anonymous;
369 int open_flag = kOpenFlags;
370 if (tunable_write_enable)
372 open_flag |= O_ACCMODE;
375 /* Put lstat() first because it is a very hot syscall for large directory
376 * listings. And the current BPF only allows a linear scan of allowed
377 * syscalls.
379 allow_nr(__NR_lstat);
381 /* Allow all the simple pre-login things and then expand upon them. */
382 seccomp_sandbox_setup_prelogin(p_sess);
384 /* Simple file descriptor-based operations. */
385 if (tunable_xferlog_enable || tunable_dual_log_enable ||
386 tunable_lock_upload_files)
388 allow_nr_1_arg_match(__NR_fcntl, 2, F_SETLKW);
389 allow_nr_1_arg_match(__NR_fcntl, 2, F_SETLK);
391 if (tunable_async_abor_enable)
393 allow_nr_2_arg_match(__NR_fcntl, 2, F_SETOWN, 3, vsf_sysutil_getpid());
395 allow_nr_2_arg_match(__NR_setsockopt, 2, SOL_SOCKET, 3, SO_KEEPALIVE);
396 allow_nr_2_arg_match(__NR_setsockopt, 2, SOL_SOCKET, 3, SO_LINGER);
397 allow_nr_2_arg_match(__NR_setsockopt, 2, IPPROTO_IP, 3, IP_TOS);
398 allow_nr(__NR_fstat);
399 allow_nr(__NR_lseek);
400 /* Since we use chroot() to restrict filesystem access, we can just blanket
401 * allow open().
403 allow_nr_1_arg_mask(__NR_open, 2, open_flag);
404 allow_nr_1_arg_mask(__NR_openat, 3, open_flag);
405 /* Other pathname-based metadata queries. */
406 allow_nr(__NR_stat);
407 allow_nr(__NR_readlink);
408 /* Directory handling: query, change, read. */
409 allow_nr(__NR_getcwd);
410 allow_nr(__NR_chdir);
411 allow_nr(__NR_getdents);
412 /* Misc */
413 allow_nr(__NR_umask);
415 /* Config-dependent items follow. */
416 if (tunable_use_sendfile)
418 allow_nr(__NR_sendfile);
420 if (tunable_idle_session_timeout > 0 ||
421 tunable_data_connection_timeout > 0 ||
422 tunable_async_abor_enable)
424 allow_nr(__NR_rt_sigaction);
426 if (tunable_idle_session_timeout > 0 || tunable_data_connection_timeout > 0)
428 allow_nr(__NR_alarm);
431 if (tunable_one_process_model)
433 seccomp_sandbox_setup_data_connections();
434 if (is_anon && tunable_chown_uploads)
436 allow_nr(__NR_fchmod);
437 allow_nr(__NR_fchown);
440 else
442 /* Need to receieve file descriptors from privileged broker. */
443 allow_nr_1_arg_match(__NR_recvmsg, 3, 0);
444 if ((is_anon && tunable_chown_uploads) || tunable_ssl_enable)
446 /* Need to send file descriptors to privileged broker. */
447 allow_nr_1_arg_match(__NR_sendmsg, 3, 0);
451 if (tunable_syslog_enable)
453 /* The ability to pass an address spec isn't needed so disable it. We ensure
454 * the 6th arg (socklen) is 0. We could have checked the 5th arg (sockptr)
455 * but I don't know if 64-bit compares work in the kernel filter, so we're
456 * happy to check the socklen arg, which is 32 bits.
458 allow_nr_1_arg_match(__NR_sendto, 6, 0);
461 if (tunable_text_userdb_names)
463 reject_nr(__NR_socket, EACCES);
464 allow_nr_2_arg_match(__NR_mmap, 3, PROT_READ, 4, MAP_SHARED);
467 if (tunable_write_enable)
469 if (!is_anon || tunable_anon_mkdir_write_enable)
471 allow_nr(__NR_mkdir);
473 if (!is_anon ||
474 tunable_anon_other_write_enable ||
475 tunable_delete_failed_uploads)
477 allow_nr(__NR_unlink);
479 if (!is_anon || tunable_anon_other_write_enable)
481 allow_nr(__NR_rmdir);
482 allow_nr(__NR_rename);
483 allow_nr(__NR_ftruncate);
484 if (tunable_mdtm_write)
486 allow_nr(__NR_utime);
487 allow_nr(__NR_utimes);
490 if (!is_anon && tunable_chmod_enable)
492 allow_nr(__NR_chmod);
497 void
498 seccomp_sandbox_setup_postlogin_broker()
500 seccomp_sandbox_setup_base();
501 seccomp_sandbox_setup_data_connections();
502 allow_nr_1_arg_match(__NR_sendmsg, 3, 0);
505 void
506 seccomp_sandbox_lockdown()
508 size_t len = (s_syscall_index * 2) +
509 (s_1_arg_validations * 3) +
510 (s_2_arg_validations * 5) +
511 (s_3_arg_validations * 7) +
513 struct sock_filter filters[len];
514 struct sock_filter* p_filter = filters;
515 struct sock_fprog prog;
516 size_t i;
517 int ret;
519 prog.len = len;
520 prog.filter = filters;
521 /* Validate the syscall architecture. */
522 p_filter->code = BPF_LD+BPF_W+BPF_ABS;
523 p_filter->jt = 0;
524 p_filter->jf = 0;
525 /* Offset 4 for syscall architecture. */
526 p_filter->k = 4;
527 p_filter++;
528 p_filter->code = BPF_JMP+BPF_JEQ+BPF_K;
529 p_filter->jt = 1;
530 p_filter->jf = 0;
531 /* AUDIT_ARCH_X86_64 */
532 p_filter->k = 0xc000003e;
533 p_filter++;
534 p_filter->code = BPF_RET+BPF_K;
535 p_filter->jt = 0;
536 p_filter->jf = 0;
537 /* SECCOMP_RET_KILL */
538 p_filter->k = 0;
539 p_filter++;
541 /* Load the syscall number. */
542 p_filter->code = BPF_LD+BPF_W+BPF_ABS;
543 p_filter->jt = 0;
544 p_filter->jf = 0;
545 /* Offset 0 for syscall number. */
546 p_filter->k = 0;
547 p_filter++;
549 for (i = 0; i < s_syscall_index; ++i)
551 int block_size = 1;
552 if (s_args_3[i])
554 block_size = 8;
556 else if (s_args_2[i])
558 block_size = 6;
560 else if (s_args_1[i])
562 block_size = 4;
564 /* Check for syscall number match. */
565 p_filter->code = BPF_JMP+BPF_JEQ+BPF_K;
566 p_filter->jt = 0;
567 p_filter->jf = block_size;
568 p_filter->k = s_syscalls[i];
569 p_filter++;
570 /* Check argument matches if necessary. */
571 if (s_args_3[i])
573 p_filter->code = BPF_LD+BPF_W+BPF_ABS;
574 p_filter->jt = 0;
575 p_filter->jf = 0;
576 p_filter->k = 16 + ((s_args_3[i] - 1) * 8);
577 p_filter++;
578 p_filter->code = BPF_JMP+BPF_JEQ+BPF_K;
579 p_filter->jt = 0;
580 p_filter->jf = 5;
581 p_filter->k = s_vals_3[i];
582 p_filter++;
584 if (s_args_2[i])
586 p_filter->code = BPF_LD+BPF_W+BPF_ABS;
587 p_filter->jt = 0;
588 p_filter->jf = 0;
589 p_filter->k = 16 + ((s_args_2[i] - 1) * 8);
590 p_filter++;
591 p_filter->code = BPF_JMP+BPF_JEQ+BPF_K;
592 p_filter->jt = 0;
593 p_filter->jf = 3;
594 p_filter->k = s_vals_2[i];
595 p_filter++;
597 if (s_args_1[i])
599 int arg = s_args_1[i];
600 int code = BPF_JMP+BPF_JEQ+BPF_K;
601 int val = s_vals_1[i];
602 int jt = 0;
603 int jf = 1;
604 if (arg > 100)
606 arg -= 100;
607 code = BPF_JMP+BPF_JSET+BPF_K;
608 val = ~val;
609 jt = 1;
610 jf = 0;
612 p_filter->code = BPF_LD+BPF_W+BPF_ABS;
613 p_filter->jt = 0;
614 p_filter->jf = 0;
615 p_filter->k = 16 + ((arg - 1) * 8);
616 p_filter++;
617 p_filter->code = code;
618 p_filter->jt = jt;
619 p_filter->jf = jf;
620 p_filter->k = val;
621 p_filter++;
623 p_filter->code = BPF_RET+BPF_K;
624 p_filter->jt = 0;
625 p_filter->jf = 0;
626 if (!s_errnos[i])
628 /* SECCOMP_RET_ALLOW */
629 p_filter->k = 0x7fff0000;
631 else
633 /* SECCOMP_RET_ERRNO */
634 p_filter->k = 0x00050000 + s_errnos[i];
636 p_filter++;
637 if (s_args_1[i])
639 /* We trashed the accumulator so put it back. */
640 p_filter->code = BPF_LD+BPF_W+BPF_ABS;
641 p_filter->jt = 0;
642 p_filter->jf = 0;
643 p_filter->k = 0;
644 p_filter++;
647 /* No "allow" matches so kill. */
648 p_filter->code = BPF_RET+BPF_K;
649 p_filter->jt = 0;
650 p_filter->jf = 0;
651 #ifdef DEBUG_SIGSYS
652 /* SECCOMP_RET_TRAP */
653 p_filter->k = 0x00030000;
654 #else
655 /* SECCOMP_RET_KILL */
656 p_filter->k = 0;
657 #endif
659 ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
660 if (ret != 0)
662 if (errno == EINVAL)
664 /* Kernel isn't good enough. */
665 return;
667 die("prctl PR_SET_NO_NEW_PRIVS");
670 if (!tunable_seccomp_sandbox)
672 return;
675 #ifdef DEBUG_SIGSYS
677 struct sigaction sa;
678 memset(&sa, '\0', sizeof(sa));
679 sa.sa_handler = handle_sigsys;
680 sigaction(SIGSYS, &sa, NULL);
682 #endif
684 ret = prctl(PR_SET_SECCOMP, 2, &prog, 0, 0);
685 if (ret != 0)
687 if (errno == EINVAL)
689 /* Kernel isn't good enough. */
690 return;
692 die("prctl PR_SET_SECCOMP failed");
696 #else /* __linux__ && __x86_64__ */
/* Stub for builds that are not Linux/x86_64: does nothing. */
void
seccomp_sandbox_init()
{
  /* Intentionally empty. */
}
/* Stub for builds that are not Linux/x86_64: ignores p_sess, does nothing. */
void
seccomp_sandbox_setup_prelogin(const struct vsf_session* p_sess)
{
  /* Reference the parameter so it does not count as unused. */
  (void) p_sess;
}
/* Stub for builds that are not Linux/x86_64: ignores p_sess, does nothing. */
void
seccomp_sandbox_setup_postlogin(const struct vsf_session* p_sess)
{
  /* Reference the parameter so it does not count as unused. */
  (void) p_sess;
}
/* Stub for builds that are not Linux/x86_64: does nothing. */
void
seccomp_sandbox_setup_postlogin_broker()
{
  /* Intentionally empty. */
}
/* Stub for builds that are not Linux/x86_64: no filter is installed. */
void
seccomp_sandbox_lockdown()
{
  /* Intentionally empty. */
}
725 #endif /* __linux__ && __x86_64__ */