2 * linux/arch/x86-64/kernel/process.c
4 * Copyright (C) 1995 Linus Torvalds
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
12 * CPU hotplug support - ashok.raj@intel.com
16 * This file handles the architecture-dependent parts of process handling..
21 #include <linux/cpu.h>
22 #include <linux/errno.h>
23 #include <linux/sched.h>
24 #include <linux/kernel.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/module.h>
31 #include <linux/a.out.h>
32 #include <linux/interrupt.h>
33 #include <linux/delay.h>
34 #include <linux/ptrace.h>
35 #include <linux/utsname.h>
36 #include <linux/random.h>
37 #include <linux/notifier.h>
38 #include <linux/kprobes.h>
40 #include <asm/uaccess.h>
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
44 #include <asm/processor.h>
46 #include <asm/mmu_context.h>
48 #include <asm/prctl.h>
49 #include <asm/kdebug.h>
51 #include <asm/proto.h>
55 asmlinkage
extern void ret_from_fork(void);
57 unsigned long kernel_thread_flags
= CLONE_VM
| CLONE_UNTRACED
;
59 unsigned long boot_option_idle_override
= 0;
60 EXPORT_SYMBOL(boot_option_idle_override
);
63 * Powermanagement idle function, if any..
65 void (*pm_idle
)(void);
66 EXPORT_SYMBOL(pm_idle
);
67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state
);
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier
);
71 void idle_notifier_register(struct notifier_block
*n
)
73 atomic_notifier_chain_register(&idle_notifier
, n
);
75 EXPORT_SYMBOL_GPL(idle_notifier_register
);
77 void idle_notifier_unregister(struct notifier_block
*n
)
79 atomic_notifier_chain_unregister(&idle_notifier
, n
);
81 EXPORT_SYMBOL(idle_notifier_unregister
);
86 atomic_notifier_call_chain(&idle_notifier
, IDLE_START
, NULL
);
89 static void __exit_idle(void)
91 if (read_pda(isidle
) == 0)
94 atomic_notifier_call_chain(&idle_notifier
, IDLE_END
, NULL
);
97 /* Called from interrupts to signify idle end */
100 /* idle loop has pid 0 */
107 * We use this if we don't have any better
110 static void default_idle(void)
114 current_thread_info()->status
&= ~TS_POLLING
;
115 smp_mb__after_clear_bit();
116 while (!need_resched()) {
123 current_thread_info()->status
|= TS_POLLING
;
127 * On SMP it's slightly faster (but much more power-consuming!)
128 * to poll the ->need_resched flag instead of waiting for the
129 * cross-CPU IPI to arrive. Use this option with caution.
131 static void poll_idle (void)
141 "i" (_TIF_NEED_RESCHED
),
142 "m" (current_thread_info()->flags
));
145 void cpu_idle_wait(void)
147 unsigned int cpu
, this_cpu
= get_cpu();
150 set_cpus_allowed(current
, cpumask_of_cpu(this_cpu
));
154 for_each_online_cpu(cpu
) {
155 per_cpu(cpu_idle_state
, cpu
) = 1;
159 __get_cpu_var(cpu_idle_state
) = 0;
164 for_each_online_cpu(cpu
) {
165 if (cpu_isset(cpu
, map
) &&
166 !per_cpu(cpu_idle_state
, cpu
))
169 cpus_and(map
, map
, cpu_online_map
);
170 } while (!cpus_empty(map
));
172 EXPORT_SYMBOL_GPL(cpu_idle_wait
);
174 #ifdef CONFIG_HOTPLUG_CPU
175 DECLARE_PER_CPU(int, cpu_state
);
178 /* We halt the CPU with physical CPU hotplug */
179 static inline void play_dead(void)
185 __get_cpu_var(cpu_state
) = CPU_DEAD
;
192 static inline void play_dead(void)
196 #endif /* CONFIG_HOTPLUG_CPU */
199 * The idle thread. There's no useful work to be
200 * done, so just try to conserve power and have a
201 * low exit latency (ie sit in a loop waiting for
202 * somebody to say that they'd like to reschedule)
206 current_thread_info()->status
|= TS_POLLING
;
207 /* endless idle loop with no priority at all */
209 while (!need_resched()) {
212 if (__get_cpu_var(cpu_idle_state
))
213 __get_cpu_var(cpu_idle_state
) = 0;
219 if (cpu_is_offline(smp_processor_id()))
223 /* In many cases the interrupt that ended idle
224 has already called exit_idle. But some idle
225 loops can be woken up without interrupt. */
229 preempt_enable_no_resched();
236 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
237 * which can obviate IPI to trigger checking of need_resched.
238 * We execute MONITOR against need_resched and enter optimized wait state
239 * through MWAIT. Whenever someone changes need_resched, we would be woken
240 * up from MWAIT (without an IPI).
242 static void mwait_idle(void)
246 while (!need_resched()) {
247 __monitor((void *)¤t_thread_info()->flags
, 0, 0);
255 void __cpuinit
select_idle_routine(const struct cpuinfo_x86
*c
)
258 if (cpu_has(c
, X86_FEATURE_MWAIT
)) {
260 * Skip, if setup has overridden idle.
261 * One CPU supports mwait => all CPUs support mwait
265 printk("using mwait in idle threads.\n");
268 pm_idle
= mwait_idle
;
273 static int __init
idle_setup (char *str
)
275 if (!strncmp(str
, "poll", 4)) {
276 printk("using polling idle threads.\n");
280 boot_option_idle_override
= 1;
284 __setup("idle=", idle_setup
);
286 /* Prints also some state that isn't saved in the pt_regs */
287 void __show_regs(struct pt_regs
* regs
)
289 unsigned long cr0
= 0L, cr2
= 0L, cr3
= 0L, cr4
= 0L, fs
, gs
, shadowgs
;
290 unsigned int fsindex
,gsindex
;
291 unsigned int ds
,cs
,es
;
295 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
296 current
->pid
, current
->comm
, print_tainted(),
297 system_utsname
.release
,
298 (int)strcspn(system_utsname
.version
, " "),
299 system_utsname
.version
);
300 printk("RIP: %04lx:[<%016lx>] ", regs
->cs
& 0xffff, regs
->rip
);
301 printk_address(regs
->rip
);
302 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs
->ss
, regs
->rsp
,
304 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
305 regs
->rax
, regs
->rbx
, regs
->rcx
);
306 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
307 regs
->rdx
, regs
->rsi
, regs
->rdi
);
308 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
309 regs
->rbp
, regs
->r8
, regs
->r9
);
310 printk("R10: %016lx R11: %016lx R12: %016lx\n",
311 regs
->r10
, regs
->r11
, regs
->r12
);
312 printk("R13: %016lx R14: %016lx R15: %016lx\n",
313 regs
->r13
, regs
->r14
, regs
->r15
);
315 asm("movl %%ds,%0" : "=r" (ds
));
316 asm("movl %%cs,%0" : "=r" (cs
));
317 asm("movl %%es,%0" : "=r" (es
));
318 asm("movl %%fs,%0" : "=r" (fsindex
));
319 asm("movl %%gs,%0" : "=r" (gsindex
));
321 rdmsrl(MSR_FS_BASE
, fs
);
322 rdmsrl(MSR_GS_BASE
, gs
);
323 rdmsrl(MSR_KERNEL_GS_BASE
, shadowgs
);
325 asm("movq %%cr0, %0": "=r" (cr0
));
326 asm("movq %%cr2, %0": "=r" (cr2
));
327 asm("movq %%cr3, %0": "=r" (cr3
));
328 asm("movq %%cr4, %0": "=r" (cr4
));
330 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
331 fs
,fsindex
,gs
,gsindex
,shadowgs
);
332 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs
, ds
, es
, cr0
);
333 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2
, cr3
, cr4
);
336 void show_regs(struct pt_regs
*regs
)
338 printk("CPU %d:", smp_processor_id());
340 show_trace(NULL
, regs
, (void *)(regs
+ 1));
344 * Free current thread data structures etc..
346 void exit_thread(void)
348 struct task_struct
*me
= current
;
349 struct thread_struct
*t
= &me
->thread
;
351 if (me
->thread
.io_bitmap_ptr
) {
352 struct tss_struct
*tss
= &per_cpu(init_tss
, get_cpu());
354 kfree(t
->io_bitmap_ptr
);
355 t
->io_bitmap_ptr
= NULL
;
356 clear_thread_flag(TIF_IO_BITMAP
);
358 * Careful, clear this in the TSS too:
360 memset(tss
->io_bitmap
, 0xff, t
->io_bitmap_max
);
361 t
->io_bitmap_max
= 0;
366 void flush_thread(void)
368 struct task_struct
*tsk
= current
;
369 struct thread_info
*t
= current_thread_info();
371 if (t
->flags
& _TIF_ABI_PENDING
) {
372 t
->flags
^= (_TIF_ABI_PENDING
| _TIF_IA32
);
373 if (t
->flags
& _TIF_IA32
)
374 current_thread_info()->status
|= TS_COMPAT
;
376 t
->flags
&= ~_TIF_DEBUG
;
378 tsk
->thread
.debugreg0
= 0;
379 tsk
->thread
.debugreg1
= 0;
380 tsk
->thread
.debugreg2
= 0;
381 tsk
->thread
.debugreg3
= 0;
382 tsk
->thread
.debugreg6
= 0;
383 tsk
->thread
.debugreg7
= 0;
384 memset(tsk
->thread
.tls_array
, 0, sizeof(tsk
->thread
.tls_array
));
386 * Forget coprocessor state..
392 void release_thread(struct task_struct
*dead_task
)
395 if (dead_task
->mm
->context
.size
) {
396 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
398 dead_task
->mm
->context
.ldt
,
399 dead_task
->mm
->context
.size
);
405 static inline void set_32bit_tls(struct task_struct
*t
, int tls
, u32 addr
)
407 struct user_desc ud
= {
414 struct n_desc_struct
*desc
= (void *)t
->thread
.tls_array
;
416 desc
->a
= LDT_entry_a(&ud
);
417 desc
->b
= LDT_entry_b(&ud
);
420 static inline u32
read_32bit_tls(struct task_struct
*t
, int tls
)
422 struct desc_struct
*desc
= (void *)t
->thread
.tls_array
;
425 (((u32
)desc
->base1
) << 16) |
426 (((u32
)desc
->base2
) << 24);
430 * This gets called before we allocate a new thread and copy
431 * the current task into it.
433 void prepare_to_copy(struct task_struct
*tsk
)
438 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long rsp
,
439 unsigned long unused
,
440 struct task_struct
* p
, struct pt_regs
* regs
)
443 struct pt_regs
* childregs
;
444 struct task_struct
*me
= current
;
446 childregs
= ((struct pt_regs
*)
447 (THREAD_SIZE
+ task_stack_page(p
))) - 1;
451 childregs
->rsp
= rsp
;
453 childregs
->rsp
= (unsigned long)childregs
;
455 p
->thread
.rsp
= (unsigned long) childregs
;
456 p
->thread
.rsp0
= (unsigned long) (childregs
+1);
457 p
->thread
.userrsp
= me
->thread
.userrsp
;
459 set_tsk_thread_flag(p
, TIF_FORK
);
461 p
->thread
.fs
= me
->thread
.fs
;
462 p
->thread
.gs
= me
->thread
.gs
;
464 asm("mov %%gs,%0" : "=m" (p
->thread
.gsindex
));
465 asm("mov %%fs,%0" : "=m" (p
->thread
.fsindex
));
466 asm("mov %%es,%0" : "=m" (p
->thread
.es
));
467 asm("mov %%ds,%0" : "=m" (p
->thread
.ds
));
469 if (unlikely(test_tsk_thread_flag(me
, TIF_IO_BITMAP
))) {
470 p
->thread
.io_bitmap_ptr
= kmalloc(IO_BITMAP_BYTES
, GFP_KERNEL
);
471 if (!p
->thread
.io_bitmap_ptr
) {
472 p
->thread
.io_bitmap_max
= 0;
475 memcpy(p
->thread
.io_bitmap_ptr
, me
->thread
.io_bitmap_ptr
,
477 set_tsk_thread_flag(p
, TIF_IO_BITMAP
);
481 * Set a new TLS for the child thread?
483 if (clone_flags
& CLONE_SETTLS
) {
484 #ifdef CONFIG_IA32_EMULATION
485 if (test_thread_flag(TIF_IA32
))
486 err
= ia32_child_tls(p
, childregs
);
489 err
= do_arch_prctl(p
, ARCH_SET_FS
, childregs
->r8
);
495 if (err
&& p
->thread
.io_bitmap_ptr
) {
496 kfree(p
->thread
.io_bitmap_ptr
);
497 p
->thread
.io_bitmap_max
= 0;
/*
 * This special macro can be used to load a debugging register:
 * it passes the debugreg<r> member of the structure that 'thread'
 * points to, together with the register number 'r', to set_debugreg().
 *
 * 'thread' is parenthesized so that any pointer-valued expression
 * (e.g. "base + 1"), not only a plain identifier, expands correctly;
 * without the parentheses "->" would bind to the last operand only.
 * 'r' must be a literal register number because it is token-pasted.
 */
#define loaddebug(thread, r) set_debugreg((thread)->debugreg ## r, r)
507 static inline void __switch_to_xtra(struct task_struct
*prev_p
,
508 struct task_struct
*next_p
,
509 struct tss_struct
*tss
)
511 struct thread_struct
*prev
, *next
;
513 prev
= &prev_p
->thread
,
514 next
= &next_p
->thread
;
516 if (test_tsk_thread_flag(next_p
, TIF_DEBUG
)) {
526 if (test_tsk_thread_flag(next_p
, TIF_IO_BITMAP
)) {
528 * Copy the relevant range of the IO bitmap.
529 * Normally this is 128 bytes or less:
531 memcpy(tss
->io_bitmap
, next
->io_bitmap_ptr
,
532 max(prev
->io_bitmap_max
, next
->io_bitmap_max
));
533 } else if (test_tsk_thread_flag(prev_p
, TIF_IO_BITMAP
)) {
535 * Clear any possible leftover bits:
537 memset(tss
->io_bitmap
, 0xff, prev
->io_bitmap_max
);
542 * switch_to(x,y) should switch tasks from x to y.
544 * This could still be optimized:
545 * - fold all the options into a flag word and test it with a single test.
546 * - could test fs/gs bitsliced
548 * Kprobes not supported here. Set the probe on schedule instead.
550 __kprobes
struct task_struct
*
551 __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
553 struct thread_struct
*prev
= &prev_p
->thread
,
554 *next
= &next_p
->thread
;
555 int cpu
= smp_processor_id();
556 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
558 /* we're going to use this soon, after a few expensive things */
559 if (next_p
->fpu_counter
>5)
560 prefetch(&next
->i387
.fxsave
);
563 * Reload esp0, LDT and the page table pointer:
565 tss
->rsp0
= next
->rsp0
;
569 * This won't pick up thread selector changes, but I guess that is ok.
571 asm volatile("mov %%es,%0" : "=m" (prev
->es
));
572 if (unlikely(next
->es
| prev
->es
))
573 loadsegment(es
, next
->es
);
575 asm volatile ("mov %%ds,%0" : "=m" (prev
->ds
));
576 if (unlikely(next
->ds
| prev
->ds
))
577 loadsegment(ds
, next
->ds
);
586 asm volatile("movl %%fs,%0" : "=r" (fsindex
));
587 /* segment register != 0 always requires a reload.
588 also reload when it has changed.
589 when prev process used 64bit base always reload
590 to avoid an information leak. */
591 if (unlikely(fsindex
| next
->fsindex
| prev
->fs
)) {
592 loadsegment(fs
, next
->fsindex
);
593 /* check if the user used a selector != 0
594 * if yes clear 64bit base, since overloaded base
595 * is always mapped to the Null selector
600 /* when next process has a 64bit base use it */
602 wrmsrl(MSR_FS_BASE
, next
->fs
);
603 prev
->fsindex
= fsindex
;
607 asm volatile("movl %%gs,%0" : "=r" (gsindex
));
608 if (unlikely(gsindex
| next
->gsindex
| prev
->gs
)) {
609 load_gs_index(next
->gsindex
);
614 wrmsrl(MSR_KERNEL_GS_BASE
, next
->gs
);
615 prev
->gsindex
= gsindex
;
619 * Switch the PDA and FPU contexts.
621 prev
->userrsp
= read_pda(oldrsp
);
622 write_pda(oldrsp
, next
->userrsp
);
623 write_pda(pcurrent
, next_p
);
625 /* This must be here to ensure both math_state_restore() and
626 kernel_fpu_begin() work consistently.
627 And the AMD workaround requires it to be after DS reload. */
629 write_pda(kernelstack
,
630 (unsigned long)task_stack_page(next_p
) + THREAD_SIZE
- PDA_STACKOFFSET
);
631 #ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary
, next_p
->stack_canary
);
634 * Build time only check to make sure the stack_canary is at
635 * offset 40 in the pda; this is a gcc ABI requirement
637 BUILD_BUG_ON(offsetof(struct x8664_pda
, stack_canary
) != 40);
641 * Now maybe reload the debug registers and handle I/O bitmaps
643 if (unlikely((task_thread_info(next_p
)->flags
& _TIF_WORK_CTXSW
))
644 || test_tsk_thread_flag(prev_p
, TIF_IO_BITMAP
))
645 __switch_to_xtra(prev_p
, next_p
, tss
);
647 /* If the task has used fpu the last 5 timeslices, just do a full
648 * restore of the math state immediately to avoid the trap; the
649 * chances of needing FPU soon are obviously high now
651 if (next_p
->fpu_counter
>5)
652 math_state_restore();
657 * sys_execve() executes a new program.
660 long sys_execve(char __user
*name
, char __user
* __user
*argv
,
661 char __user
* __user
*envp
, struct pt_regs regs
)
666 filename
= getname(name
);
667 error
= PTR_ERR(filename
);
668 if (IS_ERR(filename
))
670 error
= do_execve(filename
, argv
, envp
, ®s
);
673 current
->ptrace
&= ~PT_DTRACE
;
674 task_unlock(current
);
680 void set_personality_64bit(void)
682 /* inherit personality from parent */
684 /* Make sure to be in 64bit mode */
685 clear_thread_flag(TIF_IA32
);
687 /* TBD: overwrites user setup. Should have two bits.
688 But 64bit processes have always behaved this way,
689 so it's not too bad. The main problem is just that
690 32bit children are affected again. */
691 current
->personality
&= ~READ_IMPLIES_EXEC
;
694 asmlinkage
long sys_fork(struct pt_regs
*regs
)
696 return do_fork(SIGCHLD
, regs
->rsp
, regs
, 0, NULL
, NULL
);
700 sys_clone(unsigned long clone_flags
, unsigned long newsp
,
701 void __user
*parent_tid
, void __user
*child_tid
, struct pt_regs
*regs
)
705 return do_fork(clone_flags
, newsp
, regs
, 0, parent_tid
, child_tid
);
709 * This is trivial, and on the face of it looks like it
710 * could equally well be done in user mode.
712 * Not so, for quite unobvious reasons - register pressure.
713 * In user mode vfork() cannot have a stack frame, and if
714 * done by calling the "clone()" system call directly, you
715 * do not have enough call-clobbered registers to hold all
716 * the information you need.
718 asmlinkage
long sys_vfork(struct pt_regs
*regs
)
720 return do_fork(CLONE_VFORK
| CLONE_VM
| SIGCHLD
, regs
->rsp
, regs
, 0,
724 unsigned long get_wchan(struct task_struct
*p
)
730 if (!p
|| p
== current
|| p
->state
==TASK_RUNNING
)
732 stack
= (unsigned long)task_stack_page(p
);
733 if (p
->thread
.rsp
< stack
|| p
->thread
.rsp
> stack
+THREAD_SIZE
)
735 fp
= *(u64
*)(p
->thread
.rsp
);
737 if (fp
< (unsigned long)stack
||
738 fp
> (unsigned long)stack
+THREAD_SIZE
)
740 rip
= *(u64
*)(fp
+8);
741 if (!in_sched_functions(rip
))
744 } while (count
++ < 16);
748 long do_arch_prctl(struct task_struct
*task
, int code
, unsigned long addr
)
751 int doit
= task
== current
;
756 if (addr
>= TASK_SIZE_OF(task
))
759 /* handle small bases via the GDT because that's faster to
761 if (addr
<= 0xffffffff) {
762 set_32bit_tls(task
, GS_TLS
, addr
);
764 load_TLS(&task
->thread
, cpu
);
765 load_gs_index(GS_TLS_SEL
);
767 task
->thread
.gsindex
= GS_TLS_SEL
;
770 task
->thread
.gsindex
= 0;
771 task
->thread
.gs
= addr
;
774 ret
= checking_wrmsrl(MSR_KERNEL_GS_BASE
, addr
);
780 /* Not strictly needed for fs, but do it for symmetry
782 if (addr
>= TASK_SIZE_OF(task
))
785 /* handle small bases via the GDT because that's faster to
787 if (addr
<= 0xffffffff) {
788 set_32bit_tls(task
, FS_TLS
, addr
);
790 load_TLS(&task
->thread
, cpu
);
791 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL
));
793 task
->thread
.fsindex
= FS_TLS_SEL
;
796 task
->thread
.fsindex
= 0;
797 task
->thread
.fs
= addr
;
799 /* set the selector to 0 to not confuse
801 asm volatile("movl %0,%%fs" :: "r" (0));
802 ret
= checking_wrmsrl(MSR_FS_BASE
, addr
);
809 if (task
->thread
.fsindex
== FS_TLS_SEL
)
810 base
= read_32bit_tls(task
, FS_TLS
);
812 rdmsrl(MSR_FS_BASE
, base
);
814 base
= task
->thread
.fs
;
815 ret
= put_user(base
, (unsigned long __user
*)addr
);
821 if (task
->thread
.gsindex
== GS_TLS_SEL
)
822 base
= read_32bit_tls(task
, GS_TLS
);
824 asm("movl %%gs,%0" : "=r" (gsindex
));
826 rdmsrl(MSR_KERNEL_GS_BASE
, base
);
828 base
= task
->thread
.gs
;
831 base
= task
->thread
.gs
;
832 ret
= put_user(base
, (unsigned long __user
*)addr
);
844 long sys_arch_prctl(int code
, unsigned long addr
)
846 return do_arch_prctl(current
, code
, addr
);
850 * Capture the user space registers if the task is not running (in user space)
852 int dump_task_regs(struct task_struct
*tsk
, elf_gregset_t
*regs
)
854 struct pt_regs
*pp
, ptregs
;
856 pp
= task_pt_regs(tsk
);
862 elf_core_copy_regs(regs
, &ptregs
);
867 unsigned long arch_align_stack(unsigned long sp
)
869 if (!(current
->personality
& ADDR_NO_RANDOMIZE
) && randomize_va_space
)
870 sp
-= get_random_int() % 8192;