From 16a434475b7bd6880282d0b4683759b037023759 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 21 Oct 2010 16:07:05 -0700 Subject: [PATCH] kernel - (mainly x86_64) - Fix a number of rare races * Move the MP lock from outside to inside exit1(), also fixing an issue where sigexit() was calling exit1() without it. * Move calls to dsched_exit_thread() and biosched_done() out of the platform code and into the mainline code. This also fixes an issue where the code was improperly blocking way too late in the thread termination code, after the point where it had been descheduled permanently and tsleep decomissioned for the thread. * Cleanup and document related code areas. * Fix a missing proc_token release in the SIGKILL exit path. * Fix FAKE_MCOUNT()s in the x86-64 code. These are NOPs anyway (since kernel profiling doesn't work), but fix them anyway. * Use APIC_PUSH_FRAME() in the Xcpustop assembly code for x86-64 in order to properly acquire a working %gs. This may improve the handling of panic()s on x86_64. * Also fix some cases if #if JG'd (ifdef'd out) code in case the code is ever used later on. * Protect set_user_TLS() with a critical section to be safe. * Add debug code to help track down further x86-64 seg-fault issues, and provide better kprintf()s for the debug path in question. --- sys/emulation/linux/i386/linux_machdep.c | 8 ++--- sys/kern/kern_exit.c | 17 ++++++--- sys/kern/kern_sig.c | 12 ++++--- sys/kern/lwkt_thread.c | 15 ++++++-- sys/platform/pc32/i386/trap.c | 2 ++ sys/platform/pc32/i386/vm_machdep.c | 2 -- sys/platform/pc64/apic/apic_vector.s | 56 +++++------------------------- sys/platform/pc64/icu/icu_vector.s | 2 +- sys/platform/pc64/x86_64/exception.S | 32 ++++++++++------- sys/platform/pc64/x86_64/ipl.s | 12 +++---- sys/platform/pc64/x86_64/machdep.c | 6 ++-- sys/platform/pc64/x86_64/swtch.s | 4 +-- sys/platform/pc64/x86_64/tls.c | 11 +++++- sys/platform/pc64/x86_64/trap.c | 29 +++++++++++++--- sys/platform/pc64/x86_64/vm_machdep.c | 14 ++++---- sys/platform/vkernel/i386/trap.c | 2 ++ sys/platform/vkernel/i386/vm_machdep.c | 2 -- sys/platform/vkernel64/x86_64/trap.c | 2 ++ sys/platform/vkernel64/x86_64/vm_machdep.c | 13 +++---- 19 files changed, 131 insertions(+), 110 deletions(-) diff --git a/sys/emulation/linux/i386/linux_machdep.c b/sys/emulation/linux/i386/linux_machdep.c index 782d15b3a5..12295701e0 100644 --- a/sys/emulation/linux/i386/linux_machdep.c +++ b/sys/emulation/linux/i386/linux_machdep.c @@ -404,10 +404,8 @@ sys_linux_exit_group(struct linux_exit_group_args *args) if (em->s->refs == 1) { EMUL_UNLOCK(); - get_mplock(); exit1(W_EXITCODE(rval, 0)); - /* notreached */ - rel_mplock(); + /* NOTREACHED */ return (0); } KKASSERT(em->proc == curproc); @@ -431,10 +429,8 @@ sys_linux_exit_group(struct linux_exit_group_args *args) } EMUL_UNLOCK(); - get_mplock(); exit1(W_EXITCODE(rval, 0)); - rel_mplock(); - /* notreached */ + /* NOTREACHED */ return (0); } diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 5016bd75a4..08f652c808 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -117,10 +118,8 @@ struct lwplist deadlwp_list[MAXCPU]; int sys_exit(struct exit_args *uap) { - get_mplock(); exit1(W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ - rel_mplock(); } /* @@ -283,6 +282,8 @@ exit1(int rv) panic("Going nowhere without my init!"); } + get_mplock(); + varsymset_clean(&p->p_varsymset); lockuninit(&p->p_varsymset.vx_lock); /* @@ -610,6 +611,14 @@ lwp_exit(int masterexit) PHOLD(p); /* + * Do any remaining work that might block on us. We should be + * coded such that further blocking is ok after decrementing + * p_nthreads but don't take the chance. + */ + dsched_exit_thread(td); + biosched_done(curthread); + + /* * We have to use the reaper for all the LWPs except the one doing * the master exit. The LWP doing the master exit can just be * left on p_lwps and the process reaper will deal with it @@ -620,11 +629,11 @@ lwp_exit(int masterexit) --p->p_nthreads; wakeup(&p->p_nthreads); LIST_INSERT_HEAD(&deadlwp_list[mycpuid], lp, u.lwp_reap_entry); - taskqueue_enqueue(taskqueue_thread[mycpuid], deadlwp_task[mycpuid]); + taskqueue_enqueue(taskqueue_thread[mycpuid], + deadlwp_task[mycpuid]); } else { --p->p_nthreads; } - biosched_done(curthread); cpu_lwp_exit(); } diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index f930f27b3a..30d4cd03ef 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -352,9 +352,10 @@ kern_sigaction(int sig, struct sigaction *act, struct sigaction *oact) FOREACH_LWP_IN_PROC(lp, p) { SIGDELSET(lp->lwp_siglist, sig); } - if (sig != SIGCONT) + if (sig != SIGCONT) { /* easier in ksignal */ SIGADDSET(p->p_sigignore, sig); + } SIGDELSET(p->p_sigcatch, sig); } else { SIGDELSET(p->p_sigignore, sig); @@ -1458,8 +1459,6 @@ proc_unstop(struct proc *p) /* * No requirements. - * - * XXX: Holds the proc_token for longer than it probably needs to. */ static int kern_sigtimedwait(sigset_t waitset, siginfo_t *info, struct timespec *timeout) @@ -1564,8 +1563,11 @@ kern_sigtimedwait(sigset_t waitset, siginfo_t *info, struct timespec *timeout) info->si_signo = sig; lwp_delsig(lp, sig); /* take the signal! */ - if (sig == SIGKILL) + if (sig == SIGKILL) { + lwkt_reltoken(&proc_token); sigexit(lp, sig); + /* NOT REACHED */ + } } lwkt_reltoken(&proc_token); @@ -1990,6 +1992,8 @@ killproc(struct proc *p, char *why) * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. + * + * This routine does not return. */ void sigexit(struct lwp *lp, int sig) diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index 99626dee98..f094be4786 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -1643,9 +1643,14 @@ lwkt_exit(void) thread_t std; globaldata_t gd; + /* + * Do any cleanup that might block here + */ if (td->td_flags & TDF_VERBOSE) kprintf("kthread %p %s has exited\n", td, td->td_comm); caps_exit(td); + biosched_done(td); + dsched_exit_thread(td); /* * Get us into a critical section to interlock gd_freetd and loop @@ -1663,14 +1668,18 @@ lwkt_exit(void) /* * Remove thread resources from kernel lists and deschedule us for - * the last time. + * the last time. We cannot block after this point or we may end + * up with a stale td on the tsleepq. */ if (td->td_flags & TDF_TSLEEPQ) tsleep_remove(td); - biosched_done(td); - dsched_exit_thread(td); lwkt_deschedule_self(td); lwkt_remove_tdallq(td); + + /* + * Final cleanup + */ + KKASSERT(gd->gd_freetd == NULL); if (td->td_flags & TDF_ALLOCATED_THREAD) gd->gd_freetd = td; cpu_thread_exit(); diff --git a/sys/platform/pc32/i386/trap.c b/sys/platform/pc32/i386/trap.c index 762e587070..0a1be98e42 100644 --- a/sys/platform/pc32/i386/trap.c +++ b/sys/platform/pc32/i386/trap.c @@ -281,6 +281,8 @@ recheck: /* * Post any pending signals. If running a virtual kernel be sure * to restore the virtual kernel's vmspace before posting the signal. + * + * WARNING! postsig() can exit and not return. */ if ((sig = CURSIG_TRACE(lp)) != 0) { get_mplock(); diff --git a/sys/platform/pc32/i386/vm_machdep.c b/sys/platform/pc32/i386/vm_machdep.c index 095e9c32e7..5d4a44fd1b 100644 --- a/sys/platform/pc32/i386/vm_machdep.c +++ b/sys/platform/pc32/i386/vm_machdep.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -301,7 +300,6 @@ cpu_lwp_exit(void) } td->td_gd->gd_cnt.v_swtch++; - dsched_exit_thread(td); crit_enter_quick(td); if (td->td_flags & TDF_TSLEEPQ) tsleep_remove(td); diff --git a/sys/platform/pc64/apic/apic_vector.s b/sys/platform/pc64/apic/apic_vector.s index 2bd8d07e20..8d01976a23 100644 --- a/sys/platform/pc64/apic/apic_vector.s +++ b/sys/platform/pc64/apic/apic_vector.s @@ -123,7 +123,7 @@ SUPERALIGN_TEXT ; \ IDTVEC(vec_name) ; \ APIC_PUSH_FRAME ; \ - FAKE_MCOUNT(15*4(%esp)) ; \ + FAKE_MCOUNT(TF_RIP(%rsp)) ; \ MASK_LEVEL_IRQ(irq_num) ; \ movq lapic, %rax ; \ movl $0, LA_EOI(%rax) ; \ @@ -193,6 +193,7 @@ Xinvltlb: /* * Executed by a CPU when it receives an Xcpustop IPI from another CPU, * + * - We cannot call doreti * - Signals its receipt. * - Waits for permission to restart. * - Processing pending IPIQ events while waiting. @@ -203,42 +204,16 @@ Xinvltlb: SUPERALIGN_TEXT .globl Xcpustop Xcpustop: - pushq %rbp - movq %rsp, %rbp - /* We save registers that are not preserved across function calls. */ - /* JG can be re-written with mov's */ - pushq %rax - pushq %rcx - pushq %rdx - pushq %rsi - pushq %rdi - pushq %r8 - pushq %r9 - pushq %r10 - pushq %r11 - -#if JG - /* JGXXX switch to kernel %gs? */ - pushl %ds /* save current data segment */ - pushl %fs - - movl $KDSEL, %eax - mov %ax, %ds /* use KERNEL data segment */ - movl $KPSEL, %eax - mov %ax, %fs -#endif - + APIC_PUSH_FRAME movq lapic, %rax movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ - /* JG */ movl PCPU(cpuid), %eax imull $PCB_SIZE, %eax leaq CNAME(stoppcbs), %rdi addq %rax, %rdi call CNAME(savectx) /* Save process context */ - - + movl PCPU(cpuid), %eax /* @@ -253,6 +228,7 @@ Xcpustop: pushq %rax call lwkt_smp_stopped popq %rax + pause btl %eax, started_cpus /* while (!(started_cpus & (1<gd_common_tss.tss_rsp0 &= ~0xFul; + gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF; gd->gd_rsp0 = gd->gd_common_tss.tss_rsp0; /* doublefault stack space, runs on ist1 */ diff --git a/sys/platform/pc64/x86_64/swtch.s b/sys/platform/pc64/x86_64/swtch.s index 80efbc8b3e..b0c29692f1 100644 --- a/sys/platform/pc64/x86_64/swtch.s +++ b/sys/platform/pc64/x86_64/swtch.s @@ -346,8 +346,8 @@ ENTRY(cpu_heavy_restore) jnz 2f #endif - /* JG - * Going back to the common_tss. We may need to update TSS_ESP0 + /* + * Going back to the common_tss. We may need to update TSS_RSP0 * which sets the top of the supervisor stack when entering from * usermode. The PCB is at the top of the stack but we need another * 16 bytes to take vm86 into account. diff --git a/sys/platform/pc64/x86_64/tls.c b/sys/platform/pc64/x86_64/tls.c index 578920fa5d..c4feac802b 100644 --- a/sys/platform/pc64/x86_64/tls.c +++ b/sys/platform/pc64/x86_64/tls.c @@ -141,7 +141,14 @@ sys_get_tls_area(struct get_tls_area_args *uap) } /* - * Install the TLS + * Install the TLS. + * + * It shouldn't be possible for a preemptive thread switch to do anything + * more than set gd_user_fs and wrmsr for us. Even though there is a window + * where gd_user_fs/gd_user_gs do not match the MSRs no preemptive thread + * switch should ever switch to any heavy weight thread other than our own. + * + * Still, use a critical section to be safe. * * MPSAFE */ @@ -151,6 +158,7 @@ set_user_TLS(void) struct mdglobaldata *gd = mdcpu; thread_t td = gd->mi.gd_curthread; + crit_enter_quick(td); td->td_pcb->pcb_fsbase = (register_t)td->td_tls.info[0].base; td->td_pcb->pcb_gsbase = (register_t)td->td_tls.info[1].base; if (gd->gd_user_fs != td->td_pcb->pcb_fsbase) { @@ -161,5 +169,6 @@ set_user_TLS(void) gd->gd_user_gs = td->td_pcb->pcb_gsbase; wrmsr(MSR_KGSBASE, gd->gd_user_gs); } + crit_exit_quick(td); } diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c index 67d72ae964..70059b74bb 100644 --- a/sys/platform/pc64/x86_64/trap.c +++ b/sys/platform/pc64/x86_64/trap.c @@ -144,6 +144,9 @@ static char *trap_msg[] = { static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); +static int ddb_on_seg_fault = 0; +SYSCTL_INT(_machdep, OID_AUTO, ddb_on_seg_fault, CTLFLAG_RW, + &ddb_on_seg_fault, 0, "Go to DDB on user seg-fault"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, @@ -241,6 +244,8 @@ recheck: /* * Post any pending signals. If running a virtual kernel be sure * to restore the virtual kernel's vmspace before posting the signal. + * + * WARNING! postsig() can exit and not return. */ if ((sig = CURSIG_TRACE(lp)) != 0) { get_mplock(); @@ -800,14 +805,18 @@ trap_pfault(struct trapframe *frame, int usermode) vm_prot_t ftype; thread_t td = curthread; struct lwp *lp = td->td_lwp; + struct proc *p; va = trunc_page(frame->tf_addr); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ - if (usermode) + if (usermode) { + fault_flags = -1; + ftype = -1; goto nogo; + } map = &kernel_map; } else { @@ -819,8 +828,11 @@ trap_pfault(struct trapframe *frame, int usermode) if (lp != NULL) vm = lp->lwp_vmspace; - if (vm == NULL) + if (vm == NULL) { + fault_flags = -1; + ftype = -1; goto nogo; + } map = &vm->vm_map; } @@ -863,6 +875,7 @@ trap_pfault(struct trapframe *frame, int usermode) * Don't have to worry about process locking or stacks * in the kernel. */ + fault_flags = VM_FAULT_NORMAL; rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } @@ -883,10 +896,16 @@ nogo: * NOTE: on x86_64 we have a tf_addr field in the trapframe, no * kludge is needed to pass the fault address to signal handlers. */ - struct proc *p = td->td_proc; + p = td->td_proc; if (td->td_lwp->lwp_vkernel == NULL) { - kprintf("seg-fault accessing address %p rip=%p pid=%d p_comm=%s\n", - (void *)va, (void *)frame->tf_rip, p->p_pid, p->p_comm); + kprintf("seg-fault ft=%04x ff=%04x addr=%p rip=%p " + "pid=%d p_comm=%s\n", + ftype, fault_flags, + (void *)frame->tf_addr, + (void *)frame->tf_rip, + p->p_pid, p->p_comm); + if (ddb_on_seg_fault) + Debugger("ddb_on_seg_fault"); } /* Debugger("seg-fault"); */ diff --git a/sys/platform/pc64/x86_64/vm_machdep.c b/sys/platform/pc64/x86_64/vm_machdep.c index c74edb6cbe..ec482993c3 100644 --- a/sys/platform/pc64/x86_64/vm_machdep.c +++ b/sys/platform/pc64/x86_64/vm_machdep.c @@ -53,7 +53,6 @@ #include #include #include -#include #include #include @@ -253,19 +252,22 @@ cpu_lwp_exit(void) { struct thread *td = curthread; struct pcb *pcb; + npxexit(); pcb = td->td_pcb; - KKASSERT(pcb->pcb_ext == NULL); /* Some i386 functionality was dropped */ + + /* Some i386 functionality was dropped */ + KKASSERT(pcb->pcb_ext == NULL); + + /* + * disable all hardware breakpoints + */ if (pcb->pcb_flags & PCB_DBREGS) { - /* - * disable all hardware breakpoints - */ reset_dbregs(); pcb->pcb_flags &= ~PCB_DBREGS; } td->td_gd->gd_cnt.v_swtch++; - dsched_exit_thread(td); crit_enter_quick(td); if (td->td_flags & TDF_TSLEEPQ) tsleep_remove(td); diff --git a/sys/platform/vkernel/i386/trap.c b/sys/platform/vkernel/i386/trap.c index 10477dae36..5006e84d25 100644 --- a/sys/platform/vkernel/i386/trap.c +++ b/sys/platform/vkernel/i386/trap.c @@ -256,6 +256,8 @@ recheck: /* * Post any pending signals + * + * WARNING! postsig() can exit and not return. */ if ((sig = CURSIG_TRACE(lp)) != 0) { get_mplock(); diff --git a/sys/platform/vkernel/i386/vm_machdep.c b/sys/platform/vkernel/i386/vm_machdep.c index 71dc07591f..8afdc3ed6d 100644 --- a/sys/platform/vkernel/i386/vm_machdep.c +++ b/sys/platform/vkernel/i386/vm_machdep.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include @@ -302,7 +301,6 @@ cpu_lwp_exit(void) } td->td_gd->gd_cnt.v_swtch++; - dsched_exit_thread(td); crit_enter_quick(td); if (td->td_flags & TDF_TSLEEPQ) tsleep_remove(td); diff --git a/sys/platform/vkernel64/x86_64/trap.c b/sys/platform/vkernel64/x86_64/trap.c index 118d9aa138..5c46b35ffe 100644 --- a/sys/platform/vkernel64/x86_64/trap.c +++ b/sys/platform/vkernel64/x86_64/trap.c @@ -256,6 +256,8 @@ recheck: /* * Post any pending signals + * + * WARNING! postsig() can exit and not return. */ if ((sig = CURSIG_TRACE(lp)) != 0) { get_mplock(); diff --git a/sys/platform/vkernel64/x86_64/vm_machdep.c b/sys/platform/vkernel64/x86_64/vm_machdep.c index b348472400..d8c047cf2b 100644 --- a/sys/platform/vkernel64/x86_64/vm_machdep.c +++ b/sys/platform/vkernel64/x86_64/vm_machdep.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -257,17 +256,19 @@ cpu_lwp_exit(void) struct pcb *pcb; npxexit(); pcb = td->td_pcb; - KKASSERT(pcb->pcb_ext == NULL); /* Some i386 functionality was dropped */ + + /* Some i386 functionality was dropped */ + KKASSERT(pcb->pcb_ext == NULL); + + /* + * disable all hardware breakpoints + */ if (pcb->pcb_flags & PCB_DBREGS) { - /* - * disable all hardware breakpoints - */ reset_dbregs(); pcb->pcb_flags &= ~PCB_DBREGS; } td->td_gd->gd_cnt.v_swtch++; - dsched_exit_thread(td); crit_enter_quick(td); lwkt_deschedule_self(td); lwkt_remove_tdallq(td); -- 2.11.4.GIT