/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/syscalls.h>
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
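/*
 * old_rsp caches the user-space stack pointer across kernel entry, and
 * is_idle flags whether this CPU is currently inside the idle loop so that
 * the idle notifier chain below fires IDLE_START/IDLE_END at the right time.
 */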
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
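/*
 * Example (illustrative only, not part of this file): code that wants to
 * know when this CPU enters or leaves idle can hook the chain with a
 * notifier callback; the action argument is IDLE_START or IDLE_END.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */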
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the PDA stack
	 * canary up for us - and if we are the boot CPU we have
	 * a 0 stack canary. This is a good place for updating
	 * it, as we wont ever return from this function (so the
	 * invalid canaries already on the stack wont ever
	 * trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {
			rmb();
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}
		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}

	ds_exit_thread(current);
}
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr	= addr,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
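/*
 * The two helpers above back the "small base" optimization used by
 * do_arch_prctl() below: an FS or GS base that fits in 32 bits is stored
 * in a GDT TLS slot and addressed through a selector, which is cheaper to
 * switch than rewriting MSR_FS_BASE/MSR_KERNEL_GS_BASE on every context
 * switch.
 */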
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
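/*
 * copy_thread() builds the child's kernel stack: a pt_regs copy at the top
 * of the stack, thread.sp/sp0 pointing at it, inherited segment state, an
 * optional copy of the parent's I/O bitmap, and the CLONE_SETTLS thread
 * pointer if requested.
 */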
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	ds_copy_thread(p, me);

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
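/*
 * For example, loaddebug(next, 7) expands to
 * set_debugreg(next->debugreg7, 7) via token pasting.
 */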
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
		ds_switch_to(prev_p, next_p);
	else if (next->debugctlmsr != prev->debugctlmsr)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();

	return prev_p;
}
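/*
 * Note: __switch_to() is not called directly; the switch_to() macro in
 * <asm/system.h> saves and restores the callee-saved registers and the
 * stack, then hands prev_p and next_p to this function.
 */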
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
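/*
 * Illustrative user-space call (see the arch_prctl(2) man page); not part
 * of this file.  A 64-bit process can point FS at a TLS block with
 *
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tls_block);
 *
 * and read the base back via ARCH_GET_FS into an unsigned long.
 */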
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}