/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 *
 *  This file handles the architecture-dependent parts of process handling.
 */
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
23 #include <linux/kernel.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 #include <linux/prctl.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
45 #include <asm/processor.h>
47 #include <asm/mmu_context.h>
49 #include <asm/prctl.h>
51 #include <asm/proto.h>
55 asmlinkage
extern void ret_from_fork(void);
57 unsigned long kernel_thread_flags
= CLONE_VM
| CLONE_UNTRACED
;
59 static ATOMIC_NOTIFIER_HEAD(idle_notifier
);
61 void idle_notifier_register(struct notifier_block
*n
)
63 atomic_notifier_chain_register(&idle_notifier
, n
);
69 atomic_notifier_call_chain(&idle_notifier
, IDLE_START
, NULL
);
72 static void __exit_idle(void)
74 if (test_and_clear_bit_pda(0, isidle
) == 0)
76 atomic_notifier_call_chain(&idle_notifier
, IDLE_END
, NULL
);
79 /* Called from interrupts to signify idle end */
82 /* idle loop has pid 0 */
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	mb();
	/* Ack it: let the hotplug core see this CPU is truly down. */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
}
#else
static inline void play_dead(void)
{
	/* Without CPU hotplug an offline CPU must never reach here. */
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
112 * The idle thread. There's no useful work to be
113 * done, so just try to conserve power and have a
114 * low exit latency (ie sit in a loop waiting for
115 * somebody to say that they'd like to reschedule)
119 current_thread_info()->status
|= TS_POLLING
;
120 /* endless idle loop with no priority at all */
122 tick_nohz_stop_sched_tick(1);
123 while (!need_resched()) {
127 if (cpu_is_offline(smp_processor_id()))
130 * Idle routines should keep interrupts disabled
131 * from here on, until they go to idle.
132 * Otherwise, idle callbacks can misfire.
136 /* Don't trace irqs off for idle */
137 stop_critical_timings();
139 start_critical_timings();
140 /* In many cases the interrupt that ended idle
141 has already called exit_idle. But some idle
142 loops can be woken up without interrupt. */
146 tick_nohz_restart_sched_tick();
147 preempt_enable_no_resched();
153 /* Prints also some state that isn't saved in the pt_regs */
154 void __show_regs(struct pt_regs
* regs
)
156 unsigned long cr0
= 0L, cr2
= 0L, cr3
= 0L, cr4
= 0L, fs
, gs
, shadowgs
;
157 unsigned long d0
, d1
, d2
, d3
, d6
, d7
;
158 unsigned int fsindex
, gsindex
;
159 unsigned int ds
, cs
, es
;
163 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
164 current
->pid
, current
->comm
, print_tainted(),
165 init_utsname()->release
,
166 (int)strcspn(init_utsname()->version
, " "),
167 init_utsname()->version
);
168 printk("RIP: %04lx:[<%016lx>] ", regs
->cs
& 0xffff, regs
->ip
);
169 printk_address(regs
->ip
, 1);
170 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs
->ss
, regs
->sp
,
172 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
173 regs
->ax
, regs
->bx
, regs
->cx
);
174 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
175 regs
->dx
, regs
->si
, regs
->di
);
176 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
177 regs
->bp
, regs
->r8
, regs
->r9
);
178 printk("R10: %016lx R11: %016lx R12: %016lx\n",
179 regs
->r10
, regs
->r11
, regs
->r12
);
180 printk("R13: %016lx R14: %016lx R15: %016lx\n",
181 regs
->r13
, regs
->r14
, regs
->r15
);
183 asm("movl %%ds,%0" : "=r" (ds
));
184 asm("movl %%cs,%0" : "=r" (cs
));
185 asm("movl %%es,%0" : "=r" (es
));
186 asm("movl %%fs,%0" : "=r" (fsindex
));
187 asm("movl %%gs,%0" : "=r" (gsindex
));
189 rdmsrl(MSR_FS_BASE
, fs
);
190 rdmsrl(MSR_GS_BASE
, gs
);
191 rdmsrl(MSR_KERNEL_GS_BASE
, shadowgs
);
198 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
199 fs
,fsindex
,gs
,gsindex
,shadowgs
);
200 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs
, ds
, es
, cr0
);
201 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2
, cr3
, cr4
);
206 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0
, d1
, d2
);
210 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3
, d6
, d7
);
213 void show_regs(struct pt_regs
*regs
)
215 printk("CPU %d:", smp_processor_id());
217 show_trace(NULL
, regs
, (void *)(regs
+ 1), regs
->bp
);
221 * Free current thread data structures etc..
223 void exit_thread(void)
225 struct task_struct
*me
= current
;
226 struct thread_struct
*t
= &me
->thread
;
228 if (me
->thread
.io_bitmap_ptr
) {
229 struct tss_struct
*tss
= &per_cpu(init_tss
, get_cpu());
231 kfree(t
->io_bitmap_ptr
);
232 t
->io_bitmap_ptr
= NULL
;
233 clear_thread_flag(TIF_IO_BITMAP
);
235 * Careful, clear this in the TSS too:
237 memset(tss
->io_bitmap
, 0xff, t
->io_bitmap_max
);
238 t
->io_bitmap_max
= 0;
243 void flush_thread(void)
245 struct task_struct
*tsk
= current
;
247 if (test_tsk_thread_flag(tsk
, TIF_ABI_PENDING
)) {
248 clear_tsk_thread_flag(tsk
, TIF_ABI_PENDING
);
249 if (test_tsk_thread_flag(tsk
, TIF_IA32
)) {
250 clear_tsk_thread_flag(tsk
, TIF_IA32
);
252 set_tsk_thread_flag(tsk
, TIF_IA32
);
253 current_thread_info()->status
|= TS_COMPAT
;
256 clear_tsk_thread_flag(tsk
, TIF_DEBUG
);
258 tsk
->thread
.debugreg0
= 0;
259 tsk
->thread
.debugreg1
= 0;
260 tsk
->thread
.debugreg2
= 0;
261 tsk
->thread
.debugreg3
= 0;
262 tsk
->thread
.debugreg6
= 0;
263 tsk
->thread
.debugreg7
= 0;
264 memset(tsk
->thread
.tls_array
, 0, sizeof(tsk
->thread
.tls_array
));
266 * Forget coprocessor state..
268 tsk
->fpu_counter
= 0;
273 void release_thread(struct task_struct
*dead_task
)
276 if (dead_task
->mm
->context
.size
) {
277 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
279 dead_task
->mm
->context
.ldt
,
280 dead_task
->mm
->context
.size
);
286 static inline void set_32bit_tls(struct task_struct
*t
, int tls
, u32 addr
)
288 struct user_desc ud
= {
295 struct desc_struct
*desc
= t
->thread
.tls_array
;
300 static inline u32
read_32bit_tls(struct task_struct
*t
, int tls
)
302 return get_desc_base(&t
->thread
.tls_array
[tls
]);
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	/* Flush lazy FPU state into the task struct before copying it. */
	unlazy_fpu(tsk);
}
314 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long sp
,
315 unsigned long unused
,
316 struct task_struct
* p
, struct pt_regs
* regs
)
319 struct pt_regs
* childregs
;
320 struct task_struct
*me
= current
;
322 childregs
= ((struct pt_regs
*)
323 (THREAD_SIZE
+ task_stack_page(p
))) - 1;
329 childregs
->sp
= (unsigned long)childregs
;
331 p
->thread
.sp
= (unsigned long) childregs
;
332 p
->thread
.sp0
= (unsigned long) (childregs
+1);
333 p
->thread
.usersp
= me
->thread
.usersp
;
335 set_tsk_thread_flag(p
, TIF_FORK
);
337 p
->thread
.fs
= me
->thread
.fs
;
338 p
->thread
.gs
= me
->thread
.gs
;
340 savesegment(gs
, p
->thread
.gsindex
);
341 savesegment(fs
, p
->thread
.fsindex
);
342 savesegment(es
, p
->thread
.es
);
343 savesegment(ds
, p
->thread
.ds
);
345 if (unlikely(test_tsk_thread_flag(me
, TIF_IO_BITMAP
))) {
346 p
->thread
.io_bitmap_ptr
= kmalloc(IO_BITMAP_BYTES
, GFP_KERNEL
);
347 if (!p
->thread
.io_bitmap_ptr
) {
348 p
->thread
.io_bitmap_max
= 0;
351 memcpy(p
->thread
.io_bitmap_ptr
, me
->thread
.io_bitmap_ptr
,
353 set_tsk_thread_flag(p
, TIF_IO_BITMAP
);
357 * Set a new TLS for the child thread?
359 if (clone_flags
& CLONE_SETTLS
) {
360 #ifdef CONFIG_IA32_EMULATION
361 if (test_thread_flag(TIF_IA32
))
362 err
= do_set_thread_area(p
, -1,
363 (struct user_desc __user
*)childregs
->si
, 0);
366 err
= do_arch_prctl(p
, ARCH_SET_FS
, childregs
->r8
);
372 if (err
&& p
->thread
.io_bitmap_ptr
) {
373 kfree(p
->thread
.io_bitmap_ptr
);
374 p
->thread
.io_bitmap_max
= 0;
380 start_thread(struct pt_regs
*regs
, unsigned long new_ip
, unsigned long new_sp
)
388 write_pda(oldrsp
, new_sp
);
389 regs
->cs
= __USER_CS
;
390 regs
->ss
= __USER_DS
;
394 * Free the old FP and other extended state
396 free_thread_xstate(current
);
398 EXPORT_SYMBOL_GPL(start_thread
);
400 static void hard_disable_TSC(void)
402 write_cr4(read_cr4() | X86_CR4_TSD
);
405 void disable_TSC(void)
408 if (!test_and_set_thread_flag(TIF_NOTSC
))
410 * Must flip the CPU state synchronously with
411 * TIF_NOTSC in the current running context.
417 static void hard_enable_TSC(void)
419 write_cr4(read_cr4() & ~X86_CR4_TSD
);
422 static void enable_TSC(void)
425 if (test_and_clear_thread_flag(TIF_NOTSC
))
427 * Must flip the CPU state synchronously with
428 * TIF_NOTSC in the current running context.
434 int get_tsc_mode(unsigned long adr
)
438 if (test_thread_flag(TIF_NOTSC
))
439 val
= PR_TSC_SIGSEGV
;
443 return put_user(val
, (unsigned int __user
*)adr
);
446 int set_tsc_mode(unsigned int val
)
448 if (val
== PR_TSC_SIGSEGV
)
450 else if (val
== PR_TSC_ENABLE
)
459 * This special macro can be used to load a debugging register
461 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
463 static inline void __switch_to_xtra(struct task_struct
*prev_p
,
464 struct task_struct
*next_p
,
465 struct tss_struct
*tss
)
467 struct thread_struct
*prev
, *next
;
468 unsigned long debugctl
;
470 prev
= &prev_p
->thread
,
471 next
= &next_p
->thread
;
473 debugctl
= prev
->debugctlmsr
;
474 if (next
->ds_area_msr
!= prev
->ds_area_msr
) {
475 /* we clear debugctl to make sure DS
476 * is not in use when we change it */
478 update_debugctlmsr(0);
479 wrmsrl(MSR_IA32_DS_AREA
, next
->ds_area_msr
);
482 if (next
->debugctlmsr
!= debugctl
)
483 update_debugctlmsr(next
->debugctlmsr
);
485 if (test_tsk_thread_flag(next_p
, TIF_DEBUG
)) {
495 if (test_tsk_thread_flag(prev_p
, TIF_NOTSC
) ^
496 test_tsk_thread_flag(next_p
, TIF_NOTSC
)) {
497 /* prev and next are different */
498 if (test_tsk_thread_flag(next_p
, TIF_NOTSC
))
504 if (test_tsk_thread_flag(next_p
, TIF_IO_BITMAP
)) {
506 * Copy the relevant range of the IO bitmap.
507 * Normally this is 128 bytes or less:
509 memcpy(tss
->io_bitmap
, next
->io_bitmap_ptr
,
510 max(prev
->io_bitmap_max
, next
->io_bitmap_max
));
511 } else if (test_tsk_thread_flag(prev_p
, TIF_IO_BITMAP
)) {
513 * Clear any possible leftover bits:
515 memset(tss
->io_bitmap
, 0xff, prev
->io_bitmap_max
);
519 if (test_tsk_thread_flag(prev_p
, TIF_BTS_TRACE_TS
))
520 ptrace_bts_take_timestamp(prev_p
, BTS_TASK_DEPARTS
);
522 if (test_tsk_thread_flag(next_p
, TIF_BTS_TRACE_TS
))
523 ptrace_bts_take_timestamp(next_p
, BTS_TASK_ARRIVES
);
528 * switch_to(x,y) should switch tasks from x to y.
530 * This could still be optimized:
531 * - fold all the options into a flag word and test it with a single test.
532 * - could test fs/gs bitsliced
534 * Kprobes not supported here. Set the probe on schedule instead.
537 __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
539 struct thread_struct
*prev
= &prev_p
->thread
;
540 struct thread_struct
*next
= &next_p
->thread
;
541 int cpu
= smp_processor_id();
542 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
543 unsigned fsindex
, gsindex
;
545 /* we're going to use this soon, after a few expensive things */
546 if (next_p
->fpu_counter
>5)
547 prefetch(next
->xstate
);
550 * Reload esp0, LDT and the page table pointer:
556 * This won't pick up thread selector changes, but I guess that is ok.
558 savesegment(es
, prev
->es
);
559 if (unlikely(next
->es
| prev
->es
))
560 loadsegment(es
, next
->es
);
562 savesegment(ds
, prev
->ds
);
563 if (unlikely(next
->ds
| prev
->ds
))
564 loadsegment(ds
, next
->ds
);
567 /* We must save %fs and %gs before load_TLS() because
568 * %fs and %gs may be cleared by load_TLS().
570 * (e.g. xen_load_tls())
572 savesegment(fs
, fsindex
);
573 savesegment(gs
, gsindex
);
578 * Leave lazy mode, flushing any hypercalls made here.
579 * This must be done before restoring TLS segments so
580 * the GDT and LDT are properly updated, and must be
581 * done before math_state_restore, so the TS bit is up
584 arch_leave_lazy_cpu_mode();
589 * Segment register != 0 always requires a reload. Also
590 * reload when it has changed. When prev process used 64bit
591 * base always reload to avoid an information leak.
593 if (unlikely(fsindex
| next
->fsindex
| prev
->fs
)) {
594 loadsegment(fs
, next
->fsindex
);
596 * Check if the user used a selector != 0; if yes
597 * clear 64bit base, since overloaded base is always
598 * mapped to the Null selector
603 /* when next process has a 64bit base use it */
605 wrmsrl(MSR_FS_BASE
, next
->fs
);
606 prev
->fsindex
= fsindex
;
608 if (unlikely(gsindex
| next
->gsindex
| prev
->gs
)) {
609 load_gs_index(next
->gsindex
);
614 wrmsrl(MSR_KERNEL_GS_BASE
, next
->gs
);
615 prev
->gsindex
= gsindex
;
617 /* Must be after DS reload */
621 * Switch the PDA and FPU contexts.
623 prev
->usersp
= read_pda(oldrsp
);
624 write_pda(oldrsp
, next
->usersp
);
625 write_pda(pcurrent
, next_p
);
627 write_pda(kernelstack
,
628 (unsigned long)task_stack_page(next_p
) +
629 THREAD_SIZE
- PDA_STACKOFFSET
);
630 #ifdef CONFIG_CC_STACKPROTECTOR
631 write_pda(stack_canary
, next_p
->stack_canary
);
633 * Build time only check to make sure the stack_canary is at
634 * offset 40 in the pda; this is a gcc ABI requirement
636 BUILD_BUG_ON(offsetof(struct x8664_pda
, stack_canary
) != 40);
640 * Now maybe reload the debug registers and handle I/O bitmaps
642 if (unlikely(task_thread_info(next_p
)->flags
& _TIF_WORK_CTXSW_NEXT
||
643 task_thread_info(prev_p
)->flags
& _TIF_WORK_CTXSW_PREV
))
644 __switch_to_xtra(prev_p
, next_p
, tss
);
646 /* If the task has used fpu the last 5 timeslices, just do a full
647 * restore of the math state immediately to avoid the trap; the
648 * chances of needing FPU soon are obviously high now
650 * tsk_used_math() checks prevent calling math_state_restore(),
651 * which can sleep in the case of !tsk_used_math()
653 if (tsk_used_math(next_p
) && next_p
->fpu_counter
> 5)
654 math_state_restore();
659 * sys_execve() executes a new program.
662 long sys_execve(char __user
*name
, char __user
* __user
*argv
,
663 char __user
* __user
*envp
, struct pt_regs
*regs
)
668 filename
= getname(name
);
669 error
= PTR_ERR(filename
);
670 if (IS_ERR(filename
))
672 error
= do_execve(filename
, argv
, envp
, regs
);
677 void set_personality_64bit(void)
679 /* inherit personality from parent */
681 /* Make sure to be in 64bit mode */
682 clear_thread_flag(TIF_IA32
);
684 /* TBD: overwrites user setup. Should have two bits.
685 But 64bit processes have always behaved this way,
686 so it's not too bad. The main problem is just that
687 32bit childs are affected again. */
688 current
->personality
&= ~READ_IMPLIES_EXEC
;
691 asmlinkage
long sys_fork(struct pt_regs
*regs
)
693 return do_fork(SIGCHLD
, regs
->sp
, regs
, 0, NULL
, NULL
);
697 sys_clone(unsigned long clone_flags
, unsigned long newsp
,
698 void __user
*parent_tid
, void __user
*child_tid
, struct pt_regs
*regs
)
702 return do_fork(clone_flags
, newsp
, regs
, 0, parent_tid
, child_tid
);
706 * This is trivial, and on the face of it looks like it
707 * could equally well be done in user mode.
709 * Not so, for quite unobvious reasons - register pressure.
710 * In user mode vfork() cannot have a stack frame, and if
711 * done by calling the "clone()" system call directly, you
712 * do not have enough call-clobbered registers to hold all
713 * the information you need.
715 asmlinkage
long sys_vfork(struct pt_regs
*regs
)
717 return do_fork(CLONE_VFORK
| CLONE_VM
| SIGCHLD
, regs
->sp
, regs
, 0,
721 unsigned long get_wchan(struct task_struct
*p
)
727 if (!p
|| p
== current
|| p
->state
==TASK_RUNNING
)
729 stack
= (unsigned long)task_stack_page(p
);
730 if (p
->thread
.sp
< stack
|| p
->thread
.sp
> stack
+THREAD_SIZE
)
732 fp
= *(u64
*)(p
->thread
.sp
);
734 if (fp
< (unsigned long)stack
||
735 fp
> (unsigned long)stack
+THREAD_SIZE
)
738 if (!in_sched_functions(ip
))
741 } while (count
++ < 16);
745 long do_arch_prctl(struct task_struct
*task
, int code
, unsigned long addr
)
748 int doit
= task
== current
;
753 if (addr
>= TASK_SIZE_OF(task
))
756 /* handle small bases via the GDT because that's faster to
758 if (addr
<= 0xffffffff) {
759 set_32bit_tls(task
, GS_TLS
, addr
);
761 load_TLS(&task
->thread
, cpu
);
762 load_gs_index(GS_TLS_SEL
);
764 task
->thread
.gsindex
= GS_TLS_SEL
;
767 task
->thread
.gsindex
= 0;
768 task
->thread
.gs
= addr
;
771 ret
= checking_wrmsrl(MSR_KERNEL_GS_BASE
, addr
);
777 /* Not strictly needed for fs, but do it for symmetry
779 if (addr
>= TASK_SIZE_OF(task
))
782 /* handle small bases via the GDT because that's faster to
784 if (addr
<= 0xffffffff) {
785 set_32bit_tls(task
, FS_TLS
, addr
);
787 load_TLS(&task
->thread
, cpu
);
788 loadsegment(fs
, FS_TLS_SEL
);
790 task
->thread
.fsindex
= FS_TLS_SEL
;
793 task
->thread
.fsindex
= 0;
794 task
->thread
.fs
= addr
;
796 /* set the selector to 0 to not confuse
799 ret
= checking_wrmsrl(MSR_FS_BASE
, addr
);
806 if (task
->thread
.fsindex
== FS_TLS_SEL
)
807 base
= read_32bit_tls(task
, FS_TLS
);
809 rdmsrl(MSR_FS_BASE
, base
);
811 base
= task
->thread
.fs
;
812 ret
= put_user(base
, (unsigned long __user
*)addr
);
818 if (task
->thread
.gsindex
== GS_TLS_SEL
)
819 base
= read_32bit_tls(task
, GS_TLS
);
821 savesegment(gs
, gsindex
);
823 rdmsrl(MSR_KERNEL_GS_BASE
, base
);
825 base
= task
->thread
.gs
;
828 base
= task
->thread
.gs
;
829 ret
= put_user(base
, (unsigned long __user
*)addr
);
841 long sys_arch_prctl(int code
, unsigned long addr
)
843 return do_arch_prctl(current
, code
, addr
);
846 unsigned long arch_align_stack(unsigned long sp
)
848 if (!(current
->personality
& ADDR_NO_RANDOMIZE
) && randomize_va_space
)
849 sp
-= get_random_int() % 8192;
853 unsigned long arch_randomize_brk(struct mm_struct
*mm
)
855 unsigned long range_end
= mm
->brk
+ 0x02000000;
856 return randomize_range(mm
->brk
, range_end
, 0) ? : mm
->brk
;