merged in amd64 linux emulation from freebsd, but the building infra is
[dragonfly/port-amd64.git] / sys / emulation / linux / amd64 / linux32_sysvec.c
blob2a7ae5398bbf0ef95d01a04532d4783f217b5a7b
1 /*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
6 * Copyright (c) 1994-1996 Søren Schmidt
7 * All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer
14 * in this position and unchanged.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * $FreeBSD: src/sys/amd64/linux32/linux32_sysvec.c,v 1.29 2007/05/14 22:40:04 jhb Exp $
33 * $DragonFly$
36 #include "opt_compat.h"
38 #ifndef COMPAT_IA32
39 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
40 #endif
42 #define __ELF_WORD_SIZE 32
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/exec.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_mib.h>
81 #include <compat/linux/linux_signal.h>
82 #include <compat/linux/linux_util.h>
84 MODULE_VERSION(linux, 1);
86 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
88 #define AUXARGS_ENTRY_32(pos, id, val) \
89 do { \
90 suword32(pos++, id); \
91 suword32(pos++, val); \
92 } while (0)
94 #if BYTE_ORDER == LITTLE_ENDIAN
95 #define SHELLMAGIC 0x2123 /* #! */
96 #else
97 #define SHELLMAGIC 0x2321
98 #endif
/*
 * The sendsig routines are not syscalls, yet they want to use the
 * ldebug() facility.  Map them onto syscall number 0, which is
 * slightly less bogus than borrowing ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0
109 extern char linux_sigcode[];
110 extern int linux_szsigcode;
112 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
114 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
115 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
117 static int elf_linux_fixup(register_t **stack_base,
118 struct image_params *iparams);
119 static register_t *linux_copyout_strings(struct image_params *imgp);
120 static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
121 caddr_t *params);
122 static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
123 static void exec_linux_setregs(struct thread *td, u_long entry,
124 u_long stack, u_long ps_strings);
125 static void linux32_fixlimit(struct rlimit *rl, int which);
127 extern LIST_HEAD(futex_list, futex) futex_list;
128 extern struct sx futex_sx;
130 static eventhandler_tag linux_exit_tag;
131 static eventhandler_tag linux_schedtail_tag;
132 static eventhandler_tag linux_exec_tag;
135 * Linux syscalls return negative errno's, we do positive and map them
136 * Reference:
137 * FreeBSD: src/sys/sys/errno.h
138 * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h
139 * linux-2.6.17.8/include/asm-generic/errno.h
141 static int bsd_to_linux_errno[ELAST + 1] = {
142 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9,
143 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
144 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
145 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
146 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
147 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
148 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
149 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9,
150 -6, -6, -43, -42, -75,-125, -84, -95, -16, -74,
151 -72, -67, -71
154 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
155 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
156 LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
157 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
158 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
159 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
160 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
161 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
162 0, LINUX_SIGUSR1, LINUX_SIGUSR2
165 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
166 SIGHUP, SIGINT, SIGQUIT, SIGILL,
167 SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
168 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
169 SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
170 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
171 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
172 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
173 SIGIO, SIGURG, SIGSYS
#define	LINUX_T_UNKNOWN	255

/*
 * Trap number reported to Linux programs, indexed by the BSD trap code.
 * Codes with no mapping hold LINUX_T_UNKNOWN.  The table is only ever
 * read, so it is const.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Look up `code' in the table above; anything past the end of the table
 * yields LINUX_T_UNKNOWN.  The comparison is made against a size_t, so
 * a negative int code converts to a very large unsigned value and is
 * rejected as well.  `code' is evaluated twice.
 */
#define	bsd_to_linux_trapcode(code) \
	((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
	 _bsd_to_linux_trapcode[(code)]: \
	 LINUX_T_UNKNOWN)
215 struct linux32_ps_strings {
216 u_int32_t ps_argvstr; /* first of 0 or more argument strings */
217 u_int ps_nargvstr; /* the number of argument strings */
218 u_int32_t ps_envstr; /* first of 0 or more environment strings */
219 u_int ps_nenvstr; /* the number of environment strings */
223 * If FreeBSD & Linux have a difference of opinion about what a trap
224 * means, deal with it here.
226 * MPSAFE
228 static int
229 translate_traps(int signal, int trap_code)
231 if (signal != SIGBUS)
232 return signal;
233 switch (trap_code) {
234 case T_PROTFLT:
235 case T_TSSFLT:
236 case T_DOUBLEFLT:
237 case T_PAGEFLT:
238 return SIGSEGV;
239 default:
240 return signal;
244 static int
245 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
247 Elf32_Auxargs *args;
248 Elf32_Addr *base;
249 Elf32_Addr *pos;
251 KASSERT(curthread->td_proc == imgp->proc &&
252 (curthread->td_proc->p_flag & P_SA) == 0,
253 ("unsafe elf_linux_fixup(), should be curproc"));
254 base = (Elf32_Addr *)*stack_base;
255 args = (Elf32_Auxargs *)imgp->auxargs;
256 pos = base + (imgp->args->argc + imgp->args->envc + 2);
258 if (args->trace)
259 AUXARGS_ENTRY_32(pos, AT_DEBUG, 1);
260 if (args->execfd != -1)
261 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
262 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
263 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
264 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
265 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
266 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
267 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
268 AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
269 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
270 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
271 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
272 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
273 AUXARGS_ENTRY_32(pos, AT_NULL, 0);
275 free(imgp->auxargs, M_TEMP);
276 imgp->auxargs = NULL;
278 base--;
279 suword32(base, (uint32_t)imgp->args->argc);
280 *stack_base = (register_t *)base;
281 return 0;
284 extern int _ucodesel, _ucode32sel, _udatasel;
285 extern unsigned long linux_sznonrtsigcode;
287 static void
288 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
290 struct thread *td = curthread;
291 struct proc *p = td->td_proc;
292 struct sigacts *psp;
293 struct trapframe *regs;
294 struct l_rt_sigframe *fp, frame;
295 int oonstack;
296 int sig;
297 int code;
299 sig = ksi->ksi_signo;
300 code = ksi->ksi_code;
301 PROC_LOCK_ASSERT(p, MA_OWNED);
302 psp = p->p_sigacts;
303 mtx_assert(&psp->ps_mtx, MA_OWNED);
304 regs = td->td_frame;
305 oonstack = sigonstack(regs->tf_rsp);
307 #ifdef DEBUG
308 if (ldebug(rt_sendsig))
309 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
310 catcher, sig, (void*)mask, code);
311 #endif
313 * Allocate space for the signal handler context.
315 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
316 SIGISMEMBER(psp->ps_sigonstack, sig)) {
317 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
318 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
319 } else
320 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
321 mtx_unlock(&psp->ps_mtx);
324 * Build the argument list for the signal handler.
326 if (p->p_sysent->sv_sigtbl)
327 if (sig <= p->p_sysent->sv_sigsize)
328 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
330 bzero(&frame, sizeof(frame));
332 frame.sf_handler = PTROUT(catcher);
333 frame.sf_sig = sig;
334 frame.sf_siginfo = PTROUT(&fp->sf_si);
335 frame.sf_ucontext = PTROUT(&fp->sf_sc);
337 /* Fill in POSIX parts */
338 frame.sf_si.lsi_signo = sig;
339 frame.sf_si.lsi_code = code;
340 frame.sf_si.lsi_addr = PTROUT(ksi->ksi_addr);
343 * Build the signal context to be used by sigreturn.
345 frame.sf_sc.uc_flags = 0; /* XXX ??? */
346 frame.sf_sc.uc_link = 0; /* XXX ??? */
348 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
349 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
350 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
351 ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
352 PROC_UNLOCK(p);
354 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
356 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0];
357 frame.sf_sc.uc_mcontext.sc_gs = rgs();
358 frame.sf_sc.uc_mcontext.sc_fs = rfs();
359 __asm __volatile("movl %%es,%0" :
360 "=rm" (frame.sf_sc.uc_mcontext.sc_es));
361 __asm __volatile("movl %%ds,%0" :
362 "=rm" (frame.sf_sc.uc_mcontext.sc_ds));
363 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi;
364 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi;
365 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp;
366 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx;
367 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx;
368 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx;
369 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax;
370 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip;
371 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs;
372 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
373 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
374 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss;
375 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err;
376 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
378 #ifdef DEBUG
379 if (ldebug(rt_sendsig))
380 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
381 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
382 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
383 #endif
385 if (copyout(&frame, fp, sizeof(frame)) != 0) {
387 * Process has trashed its stack; give it an illegal
388 * instruction to halt it in its tracks.
390 #ifdef DEBUG
391 if (ldebug(rt_sendsig))
392 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
393 fp, oonstack);
394 #endif
395 PROC_LOCK(p);
396 sigexit(td, SIGILL);
400 * Build context to run handler in.
402 regs->tf_rsp = PTROUT(fp);
403 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
404 linux_sznonrtsigcode;
405 regs->tf_rflags &= ~PSL_T;
406 regs->tf_cs = _ucode32sel;
407 regs->tf_ss = _udatasel;
408 load_ds(_udatasel);
409 td->td_pcb->pcb_ds = _udatasel;
410 load_es(_udatasel);
411 td->td_pcb->pcb_es = _udatasel;
412 /* leave user %fs and %gs untouched */
413 PROC_LOCK(p);
414 mtx_lock(&psp->ps_mtx);
419 * Send an interrupt to process.
421 * Stack is set up to allow sigcode stored
422 * in u. to call routine, followed by kcall
423 * to sigreturn routine below. After sigreturn
424 * resets the signal mask, the stack, and the
425 * frame pointer, it returns to the user
426 * specified pc, psl.
428 static void
429 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
431 struct thread *td = curthread;
432 struct proc *p = td->td_proc;
433 struct sigacts *psp;
434 struct trapframe *regs;
435 struct l_sigframe *fp, frame;
436 l_sigset_t lmask;
437 int oonstack, i;
438 int sig, code;
440 sig = ksi->ksi_signo;
441 code = ksi->ksi_code;
442 PROC_LOCK_ASSERT(p, MA_OWNED);
443 psp = p->p_sigacts;
444 mtx_assert(&psp->ps_mtx, MA_OWNED);
445 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
446 /* Signal handler installed with SA_SIGINFO. */
447 linux_rt_sendsig(catcher, ksi, mask);
448 return;
451 regs = td->td_frame;
452 oonstack = sigonstack(regs->tf_rsp);
454 #ifdef DEBUG
455 if (ldebug(sendsig))
456 printf(ARGS(sendsig, "%p, %d, %p, %u"),
457 catcher, sig, (void*)mask, code);
458 #endif
461 * Allocate space for the signal handler context.
463 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
464 SIGISMEMBER(psp->ps_sigonstack, sig)) {
465 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
466 td->td_sigstk.ss_size - sizeof(struct l_sigframe));
467 } else
468 fp = (struct l_sigframe *)regs->tf_rsp - 1;
469 mtx_unlock(&psp->ps_mtx);
470 PROC_UNLOCK(p);
473 * Build the argument list for the signal handler.
475 if (p->p_sysent->sv_sigtbl)
476 if (sig <= p->p_sysent->sv_sigsize)
477 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
479 bzero(&frame, sizeof(frame));
481 frame.sf_handler = PTROUT(catcher);
482 frame.sf_sig = sig;
484 bsd_to_linux_sigset(mask, &lmask);
487 * Build the signal context to be used by sigreturn.
489 frame.sf_sc.sc_mask = lmask.__bits[0];
490 frame.sf_sc.sc_gs = rgs();
491 frame.sf_sc.sc_fs = rfs();
492 __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es));
493 __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds));
494 frame.sf_sc.sc_edi = regs->tf_rdi;
495 frame.sf_sc.sc_esi = regs->tf_rsi;
496 frame.sf_sc.sc_ebp = regs->tf_rbp;
497 frame.sf_sc.sc_ebx = regs->tf_rbx;
498 frame.sf_sc.sc_edx = regs->tf_rdx;
499 frame.sf_sc.sc_ecx = regs->tf_rcx;
500 frame.sf_sc.sc_eax = regs->tf_rax;
501 frame.sf_sc.sc_eip = regs->tf_rip;
502 frame.sf_sc.sc_cs = regs->tf_cs;
503 frame.sf_sc.sc_eflags = regs->tf_rflags;
504 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
505 frame.sf_sc.sc_ss = regs->tf_ss;
506 frame.sf_sc.sc_err = regs->tf_err;
507 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
509 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
510 frame.sf_extramask[i] = lmask.__bits[i+1];
512 if (copyout(&frame, fp, sizeof(frame)) != 0) {
514 * Process has trashed its stack; give it an illegal
515 * instruction to halt it in its tracks.
517 PROC_LOCK(p);
518 sigexit(td, SIGILL);
522 * Build context to run handler in.
524 regs->tf_rsp = PTROUT(fp);
525 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
526 regs->tf_rflags &= ~PSL_T;
527 regs->tf_cs = _ucode32sel;
528 regs->tf_ss = _udatasel;
529 load_ds(_udatasel);
530 td->td_pcb->pcb_ds = _udatasel;
531 load_es(_udatasel);
532 td->td_pcb->pcb_es = _udatasel;
533 /* leave user %fs and %gs untouched */
534 PROC_LOCK(p);
535 mtx_lock(&psp->ps_mtx);
539 * System call to cleanup state after a signal
540 * has been taken. Reset signal mask and
541 * stack state from context left by sendsig (above).
542 * Return to previous pc and psl as specified by
543 * context left by sendsig. Check carefully to
544 * make sure that the user has not modified the
545 * psl to gain improper privileges or to cause
546 * a machine fault.
549 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
551 struct proc *p = td->td_proc;
552 struct l_sigframe frame;
553 struct trapframe *regs;
554 l_sigset_t lmask;
555 int eflags, i;
556 ksiginfo_t ksi;
558 regs = td->td_frame;
560 #ifdef DEBUG
561 if (ldebug(sigreturn))
562 printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
563 #endif
565 * The trampoline code hands us the sigframe.
566 * It is unsafe to keep track of it ourselves, in the event that a
567 * program jumps out of a signal handler.
569 if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
570 return (EFAULT);
573 * Check for security violations.
575 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
576 eflags = frame.sf_sc.sc_eflags;
578 * XXX do allow users to change the privileged flag PSL_RF. The
579 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
580 * sometimes set it there too. tf_eflags is kept in the signal
581 * context during signal handling and there is no other place
582 * to remember it, so the PSL_RF bit may be corrupted by the
583 * signal handler without us knowing. Corruption of the PSL_RF
584 * bit at worst causes one more or one less debugger trap, so
585 * allowing it is fairly harmless.
587 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
588 return(EINVAL);
591 * Don't allow users to load a valid privileged %cs. Let the
592 * hardware check for invalid selectors, excess privilege in
593 * other selectors, invalid %eip's and invalid %esp's.
595 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
596 if (!CS_SECURE(frame.sf_sc.sc_cs)) {
597 ksiginfo_init_trap(&ksi);
598 ksi.ksi_signo = SIGBUS;
599 ksi.ksi_code = BUS_OBJERR;
600 ksi.ksi_trapno = T_PROTFLT;
601 ksi.ksi_addr = (void *)regs->tf_rip;
602 trapsignal(td, &ksi);
603 return(EINVAL);
606 lmask.__bits[0] = frame.sf_sc.sc_mask;
607 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
608 lmask.__bits[i+1] = frame.sf_extramask[i];
609 PROC_LOCK(p);
610 linux_to_bsd_sigset(&lmask, &td->td_sigmask);
611 SIG_CANTMASK(td->td_sigmask);
612 signotify(td);
613 PROC_UNLOCK(p);
616 * Restore signal context.
618 /* Selectors were restored by the trampoline. */
619 regs->tf_rdi = frame.sf_sc.sc_edi;
620 regs->tf_rsi = frame.sf_sc.sc_esi;
621 regs->tf_rbp = frame.sf_sc.sc_ebp;
622 regs->tf_rbx = frame.sf_sc.sc_ebx;
623 regs->tf_rdx = frame.sf_sc.sc_edx;
624 regs->tf_rcx = frame.sf_sc.sc_ecx;
625 regs->tf_rax = frame.sf_sc.sc_eax;
626 regs->tf_rip = frame.sf_sc.sc_eip;
627 regs->tf_cs = frame.sf_sc.sc_cs;
628 regs->tf_rflags = eflags;
629 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal;
630 regs->tf_ss = frame.sf_sc.sc_ss;
632 return (EJUSTRETURN);
636 * System call to cleanup state after a signal
637 * has been taken. Reset signal mask and
638 * stack state from context left by rt_sendsig (above).
639 * Return to previous pc and psl as specified by
640 * context left by sendsig. Check carefully to
641 * make sure that the user has not modified the
642 * psl to gain improper privileges or to cause
643 * a machine fault.
646 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
648 struct proc *p = td->td_proc;
649 struct l_ucontext uc;
650 struct l_sigcontext *context;
651 l_stack_t *lss;
652 stack_t ss;
653 struct trapframe *regs;
654 int eflags;
655 ksiginfo_t ksi;
657 regs = td->td_frame;
659 #ifdef DEBUG
660 if (ldebug(rt_sigreturn))
661 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
662 #endif
664 * The trampoline code hands us the ucontext.
665 * It is unsafe to keep track of it ourselves, in the event that a
666 * program jumps out of a signal handler.
668 if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
669 return (EFAULT);
671 context = &uc.uc_mcontext;
674 * Check for security violations.
676 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
677 eflags = context->sc_eflags;
679 * XXX do allow users to change the privileged flag PSL_RF. The
680 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
681 * sometimes set it there too. tf_eflags is kept in the signal
682 * context during signal handling and there is no other place
683 * to remember it, so the PSL_RF bit may be corrupted by the
684 * signal handler without us knowing. Corruption of the PSL_RF
685 * bit at worst causes one more or one less debugger trap, so
686 * allowing it is fairly harmless.
688 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
689 return(EINVAL);
692 * Don't allow users to load a valid privileged %cs. Let the
693 * hardware check for invalid selectors, excess privilege in
694 * other selectors, invalid %eip's and invalid %esp's.
696 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
697 if (!CS_SECURE(context->sc_cs)) {
698 ksiginfo_init_trap(&ksi);
699 ksi.ksi_signo = SIGBUS;
700 ksi.ksi_code = BUS_OBJERR;
701 ksi.ksi_trapno = T_PROTFLT;
702 ksi.ksi_addr = (void *)regs->tf_rip;
703 trapsignal(td, &ksi);
704 return(EINVAL);
707 PROC_LOCK(p);
708 linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
709 SIG_CANTMASK(td->td_sigmask);
710 signotify(td);
711 PROC_UNLOCK(p);
714 * Restore signal context
716 /* Selectors were restored by the trampoline. */
717 regs->tf_rdi = context->sc_edi;
718 regs->tf_rsi = context->sc_esi;
719 regs->tf_rbp = context->sc_ebp;
720 regs->tf_rbx = context->sc_ebx;
721 regs->tf_rdx = context->sc_edx;
722 regs->tf_rcx = context->sc_ecx;
723 regs->tf_rax = context->sc_eax;
724 regs->tf_rip = context->sc_eip;
725 regs->tf_cs = context->sc_cs;
726 regs->tf_rflags = eflags;
727 regs->tf_rsp = context->sc_esp_at_signal;
728 regs->tf_ss = context->sc_ss;
731 * call sigaltstack & ignore results..
733 lss = &uc.uc_stack;
734 ss.ss_sp = PTRIN(lss->ss_sp);
735 ss.ss_size = lss->ss_size;
736 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
738 #ifdef DEBUG
739 if (ldebug(rt_sigreturn))
740 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
741 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
742 #endif
743 (void)kern_sigaltstack(td, &ss, NULL);
745 return (EJUSTRETURN);
749 * MPSAFE
751 static void
752 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
754 args[0] = tf->tf_rbx;
755 args[1] = tf->tf_rcx;
756 args[2] = tf->tf_rdx;
757 args[3] = tf->tf_rsi;
758 args[4] = tf->tf_rdi;
759 args[5] = tf->tf_rbp; /* Unconfirmed */
760 *params = NULL; /* no copyin */
764 * If a linux binary is exec'ing something, try this image activator
765 * first. We override standard shell script execution in order to
766 * be able to modify the interpreter path. We only do this if a linux
767 * binary is doing the exec, so we do not create an EXEC module for it.
769 static int exec_linux_imgact_try(struct image_params *iparams);
771 static int
772 exec_linux_imgact_try(struct image_params *imgp)
774 const char *head = (const char *)imgp->image_header;
775 char *rpath;
776 int error = -1, len;
779 * The interpreter for shell scripts run from a linux binary needs
780 * to be located in /compat/linux if possible in order to recursively
781 * maintain linux path emulation.
783 if (((const short *)head)[0] == SHELLMAGIC) {
785 * Run our normal shell image activator. If it succeeds attempt
786 * to use the alternate path for the interpreter. If an alternate
787 * path is found, use our stringspace to store it.
789 if ((error = exec_shell_imgact(imgp)) == 0) {
790 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
791 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
792 if (rpath != NULL) {
793 len = strlen(rpath) + 1;
795 if (len <= MAXSHELLCMDLEN) {
796 memcpy(imgp->interpreter_name, rpath, len);
798 free(rpath, M_TEMP);
802 return(error);
806 * Clear registers on exec
807 * XXX copied from ia32_signal.c.
809 static void
810 exec_linux_setregs(td, entry, stack, ps_strings)
811 struct thread *td;
812 u_long entry;
813 u_long stack;
814 u_long ps_strings;
816 struct trapframe *regs = td->td_frame;
817 struct pcb *pcb = td->td_pcb;
819 critical_enter();
820 wrmsr(MSR_FSBASE, 0);
821 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */
822 pcb->pcb_fsbase = 0;
823 pcb->pcb_gsbase = 0;
824 critical_exit();
825 load_ds(_udatasel);
826 load_es(_udatasel);
827 load_fs(_udatasel);
828 load_gs(_udatasel);
829 pcb->pcb_ds = _udatasel;
830 pcb->pcb_es = _udatasel;
831 pcb->pcb_fs = _udatasel;
832 pcb->pcb_gs = _udatasel;
834 bzero((char *)regs, sizeof(struct trapframe));
835 regs->tf_rip = entry;
836 regs->tf_rsp = stack;
837 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
838 regs->tf_ss = _udatasel;
839 regs->tf_cs = _ucode32sel;
840 regs->tf_rbx = ps_strings;
841 load_cr0(rcr0() | CR0_MP | CR0_TS);
842 fpstate_drop(td);
844 /* Return via doreti so that we can change to a different %cs */
845 pcb->pcb_flags |= PCB_FULLCTX;
846 td->td_retval[1] = 0;
850 * XXX copied from ia32_sysvec.c.
852 static register_t *
853 linux_copyout_strings(struct image_params *imgp)
855 int argc, envc;
856 u_int32_t *vectp;
857 char *stringp, *destp;
858 u_int32_t *stack_base;
859 struct linux32_ps_strings *arginfo;
860 int sigcodesz;
863 * Calculate string base and vector table pointers.
864 * Also deal with signal trampoline code for this exec type.
866 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
867 sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode);
868 destp = (caddr_t)arginfo - sigcodesz - SPARE_USRSPACE -
869 roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
872 * install sigcode
874 if (sigcodesz)
875 copyout(imgp->proc->p_sysent->sv_sigcode,
876 ((caddr_t)arginfo - sigcodesz), sigcodesz);
879 * If we have a valid auxargs ptr, prepare some room
880 * on the stack.
882 if (imgp->auxargs) {
884 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
885 * lower compatibility.
887 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
888 : (AT_COUNT * 2);
890 * The '+ 2' is for the null pointers at the end of each of
891 * the arg and env vector sets,and imgp->auxarg_size is room
892 * for argument of Runtime loader.
894 vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 +
895 imgp->auxarg_size) * sizeof(u_int32_t));
897 } else
899 * The '+ 2' is for the null pointers at the end of each of
900 * the arg and env vector sets
902 vectp = (u_int32_t *)
903 (destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
906 * vectp also becomes our initial stack base
908 stack_base = vectp;
910 stringp = imgp->args->begin_argv;
911 argc = imgp->args->argc;
912 envc = imgp->args->envc;
914 * Copy out strings - arguments and environment.
916 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
919 * Fill in "ps_strings" struct for ps, w, etc.
921 suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
922 suword32(&arginfo->ps_nargvstr, argc);
925 * Fill in argument portion of vector table.
927 for (; argc > 0; --argc) {
928 suword32(vectp++, (u_int32_t)(intptr_t)destp);
929 while (*stringp++ != 0)
930 destp++;
931 destp++;
934 /* a null vector table pointer separates the argp's from the envp's */
935 suword32(vectp++, 0);
937 suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
938 suword32(&arginfo->ps_nenvstr, envc);
941 * Fill in environment portion of vector table.
943 for (; envc > 0; --envc) {
944 suword32(vectp++, (u_int32_t)(intptr_t)destp);
945 while (*stringp++ != 0)
946 destp++;
947 destp++;
950 /* end of vector table is a null pointer */
951 suword32(vectp, 0);
953 return ((register_t *)stack_base);
956 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
957 "32-bit Linux emulation");
959 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ;
960 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
961 &linux32_maxdsiz, 0, "");
962 static u_long linux32_maxssiz = LINUX32_MAXSSIZ;
963 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
964 &linux32_maxssiz, 0, "");
965 static u_long linux32_maxvmem = LINUX32_MAXVMEM;
966 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
967 &linux32_maxvmem, 0, "");
969 static void
970 linux32_fixlimit(struct rlimit *rl, int which)
973 switch (which) {
974 case RLIMIT_DATA:
975 if (linux32_maxdsiz != 0) {
976 if (rl->rlim_cur > linux32_maxdsiz)
977 rl->rlim_cur = linux32_maxdsiz;
978 if (rl->rlim_max > linux32_maxdsiz)
979 rl->rlim_max = linux32_maxdsiz;
981 break;
982 case RLIMIT_STACK:
983 if (linux32_maxssiz != 0) {
984 if (rl->rlim_cur > linux32_maxssiz)
985 rl->rlim_cur = linux32_maxssiz;
986 if (rl->rlim_max > linux32_maxssiz)
987 rl->rlim_max = linux32_maxssiz;
989 break;
990 case RLIMIT_VMEM:
991 if (linux32_maxvmem != 0) {
992 if (rl->rlim_cur > linux32_maxvmem)
993 rl->rlim_cur = linux32_maxvmem;
994 if (rl->rlim_max > linux32_maxvmem)
995 rl->rlim_max = linux32_maxvmem;
997 break;
/*
 * System entry vector used by the two Linux ELF brands below.  It ties
 * together the tables and routines defined in this file: the syscall
 * table, the signal and errno translation tables, the trap translator,
 * the stack fixup, signal delivery, the sigcode, syscall argument
 * preparation, the shell-script image activator, string copy-out,
 * register setup and the resource-limit hook.  The initializer is
 * positional, so the order of the entries matters.
 *
 * NOTE(review): this copy of the file lost every very short line, and
 * the embedded listing numbers jump from 1003 to 1005 here.  One
 * positional initializer (the entry between linux_sysent and
 * LINUX_SIGTBLSZ) is therefore missing, along with the closing "};".
 * Restore both from the upstream file before building -- do not guess
 * the value.
 */
1001 struct sysentvec elf_linux_sysvec = {
1002 LINUX_SYS_MAXSYSCALL,
1003 linux_sysent,
1005 LINUX_SIGTBLSZ,
1006 bsd_to_linux_signal,
1007 ELAST + 1,
1008 bsd_to_linux_errno,
1009 translate_traps,
1010 elf_linux_fixup,
1011 linux_sendsig,
1012 linux_sigcode,
1013 &linux_szsigcode,
1014 linux_prepsyscall,
1015 "Linux ELF32",
1016 elf32_coredump,
1017 exec_linux_imgact_try,
1018 LINUX_MINSIGSTKSZ,
1019 PAGE_SIZE,
1020 VM_MIN_ADDRESS,
1021 LINUX32_USRSTACK,
1022 LINUX32_USRSTACK,
1023 LINUX32_PS_STRINGS,
1024 VM_PROT_ALL,
1025 linux_copyout_strings,
1026 exec_linux_setregs,
1027 linux32_fixlimit
1030 static Elf32_Brandinfo linux_brand = {
1031 ELFOSABI_LINUX,
1032 EM_386,
1033 "Linux",
1034 "/compat/linux",
1035 "/lib/ld-linux.so.1",
1036 &elf_linux_sysvec,
1037 NULL,
1038 BI_CAN_EXEC_DYN,
1041 static Elf32_Brandinfo linux_glibc2brand = {
1042 ELFOSABI_LINUX,
1043 EM_386,
1044 "Linux",
1045 "/compat/linux",
1046 "/lib/ld-linux.so.2",
1047 &elf_linux_sysvec,
1048 NULL,
1049 BI_CAN_EXEC_DYN,
1052 Elf32_Brandinfo *linux_brandlist[] = {
1053 &linux_brand,
1054 &linux_glibc2brand,
1055 NULL
1058 static int
1059 linux_elf_modevent(module_t mod, int type, void *data)
1061 Elf32_Brandinfo **brandinfo;
1062 int error;
1063 struct linux_ioctl_handler **lihp;
1064 struct linux_device_handler **ldhp;
1066 error = 0;
1068 switch(type) {
1069 case MOD_LOAD:
1070 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1071 ++brandinfo)
1072 if (elf32_insert_brand_entry(*brandinfo) < 0)
1073 error = EINVAL;
1074 if (error == 0) {
1075 SET_FOREACH(lihp, linux_ioctl_handler_set)
1076 linux_ioctl_register_handler(*lihp);
1077 SET_FOREACH(ldhp, linux_device_handler_set)
1078 linux_device_register_handler(*ldhp);
1079 mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1080 sx_init(&emul_shared_lock, "emuldata->shared lock");
1081 LIST_INIT(&futex_list);
1082 sx_init(&futex_sx, "futex protection lock");
1083 linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
1084 NULL, 1000);
1085 linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
1086 NULL, 1000);
1087 linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
1088 NULL, 1000);
1089 if (bootverbose)
1090 printf("Linux ELF exec handler installed\n");
1091 } else
1092 printf("cannot insert Linux ELF brand handler\n");
1093 break;
1094 case MOD_UNLOAD:
1095 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1096 ++brandinfo)
1097 if (elf32_brand_inuse(*brandinfo))
1098 error = EBUSY;
1099 if (error == 0) {
1100 for (brandinfo = &linux_brandlist[0];
1101 *brandinfo != NULL; ++brandinfo)
1102 if (elf32_remove_brand_entry(*brandinfo) < 0)
1103 error = EINVAL;
1105 if (error == 0) {
1106 SET_FOREACH(lihp, linux_ioctl_handler_set)
1107 linux_ioctl_unregister_handler(*lihp);
1108 SET_FOREACH(ldhp, linux_device_handler_set)
1109 linux_device_unregister_handler(*ldhp);
1110 mtx_destroy(&emul_lock);
1111 sx_destroy(&emul_shared_lock);
1112 sx_destroy(&futex_sx);
1113 EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1114 EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
1115 EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1116 if (bootverbose)
1117 printf("Linux ELF exec handler removed\n");
1118 } else
1119 printf("Could not deinstall ELF interpreter entry\n");
1120 break;
1121 default:
1122 return EOPNOTSUPP;
1124 return error;
/*
 * Module glue: names the module "linuxelf", routes its events to
 * linux_elf_modevent(), and declares it with DECLARE_MODULE at
 * SI_SUB_EXEC / SI_ORDER_ANY.
 *
 * NOTE(review): this copy of the file lost every very short line, and
 * the embedded listing numbers jump from 1129 to 1133 here.  The last
 * initializer of linux_elf_mod and the closing "};" are missing;
 * restore them from the upstream file before building.
 */
1127 static moduledata_t linux_elf_mod = {
1128 "linuxelf",
1129 linux_elf_modevent,
1133 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);