/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
59 asmlinkage
extern void ret_from_fork(void);
61 DEFINE_PER_CPU(struct task_struct
*, current_task
) = &init_task
;
62 EXPORT_PER_CPU_SYMBOL(current_task
);
64 DEFINE_PER_CPU(unsigned long, old_rsp
);
65 static DEFINE_PER_CPU(unsigned char, is_idle
);
67 unsigned long kernel_thread_flags
= CLONE_VM
| CLONE_UNTRACED
;
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier
);
71 void idle_notifier_register(struct notifier_block
*n
)
73 atomic_notifier_chain_register(&idle_notifier
, n
);
75 EXPORT_SYMBOL_GPL(idle_notifier_register
);
77 void idle_notifier_unregister(struct notifier_block
*n
)
79 atomic_notifier_chain_unregister(&idle_notifier
, n
);
81 EXPORT_SYMBOL_GPL(idle_notifier_unregister
);
85 percpu_write(is_idle
, 1);
86 atomic_notifier_call_chain(&idle_notifier
, IDLE_START
, NULL
);
89 static void __exit_idle(void)
91 if (x86_test_and_clear_bit_percpu(0, is_idle
) == 0)
93 atomic_notifier_call_chain(&idle_notifier
, IDLE_END
, NULL
);
96 /* Called from interrupts to signify idle end */
99 /* idle loop has pid 0 */
/* Without CPU hotplug an online CPU can never go away, so this is a bug. */
static inline void play_dead(void)
{
	BUG();
}
113 * The idle thread. There's no useful work to be
114 * done, so just try to conserve power and have a
115 * low exit latency (ie sit in a loop waiting for
116 * somebody to say that they'd like to reschedule)
120 current_thread_info()->status
|= TS_POLLING
;
123 * If we're the non-boot CPU, nothing set the stack canary up
124 * for us. CPU0 already has it initialized but no harm in
125 * doing it again. This is a good place for updating it, as
126 * we wont ever return from this function (so the invalid
127 * canaries already on the stack wont ever trigger).
129 boot_init_stack_canary();
131 /* endless idle loop with no priority at all */
133 tick_nohz_stop_sched_tick(1);
134 while (!need_resched()) {
138 if (cpu_is_offline(smp_processor_id()))
141 * Idle routines should keep interrupts disabled
142 * from here on, until they go to idle.
143 * Otherwise, idle callbacks can misfire.
147 /* Don't trace irqs off for idle */
148 stop_critical_timings();
150 start_critical_timings();
151 /* In many cases the interrupt that ended idle
152 has already called exit_idle. But some idle
153 loops can be woken up without interrupt. */
157 tick_nohz_restart_sched_tick();
159 __preempt_enable_no_resched();
166 /* Prints also some state that isn't saved in the pt_regs */
167 void __show_regs(struct pt_regs
*regs
, int all
)
169 unsigned long cr0
= 0L, cr2
= 0L, cr3
= 0L, cr4
= 0L, fs
, gs
, shadowgs
;
170 unsigned long d0
, d1
, d2
, d3
, d6
, d7
;
171 unsigned int fsindex
, gsindex
;
172 unsigned int ds
, cs
, es
;
177 board
= dmi_get_system_info(DMI_PRODUCT_NAME
);
180 printk(KERN_INFO
"Pid: %d, comm: %.20s %s %s %.*s %s\n",
181 current
->pid
, current
->comm
, print_tainted(),
182 init_utsname()->release
,
183 (int)strcspn(init_utsname()->version
, " "),
184 init_utsname()->version
, board
);
185 printk(KERN_INFO
"RIP: %04lx:[<%016lx>] ", regs
->cs
& 0xffff, regs
->ip
);
186 printk_address(regs
->ip
, 1);
187 printk(KERN_INFO
"RSP: %04lx:%016lx EFLAGS: %08lx\n", regs
->ss
,
188 regs
->sp
, regs
->flags
);
189 printk(KERN_INFO
"RAX: %016lx RBX: %016lx RCX: %016lx\n",
190 regs
->ax
, regs
->bx
, regs
->cx
);
191 printk(KERN_INFO
"RDX: %016lx RSI: %016lx RDI: %016lx\n",
192 regs
->dx
, regs
->si
, regs
->di
);
193 printk(KERN_INFO
"RBP: %016lx R08: %016lx R09: %016lx\n",
194 regs
->bp
, regs
->r8
, regs
->r9
);
195 printk(KERN_INFO
"R10: %016lx R11: %016lx R12: %016lx\n",
196 regs
->r10
, regs
->r11
, regs
->r12
);
197 printk(KERN_INFO
"R13: %016lx R14: %016lx R15: %016lx\n",
198 regs
->r13
, regs
->r14
, regs
->r15
);
200 asm("movl %%ds,%0" : "=r" (ds
));
201 asm("movl %%cs,%0" : "=r" (cs
));
202 asm("movl %%es,%0" : "=r" (es
));
203 asm("movl %%fs,%0" : "=r" (fsindex
));
204 asm("movl %%gs,%0" : "=r" (gsindex
));
206 rdmsrl(MSR_FS_BASE
, fs
);
207 rdmsrl(MSR_GS_BASE
, gs
);
208 rdmsrl(MSR_KERNEL_GS_BASE
, shadowgs
);
218 printk(KERN_INFO
"FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
219 fs
, fsindex
, gs
, gsindex
, shadowgs
);
220 printk(KERN_INFO
"CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs
, ds
,
222 printk(KERN_INFO
"CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2
, cr3
,
228 printk(KERN_INFO
"DR0: %016lx DR1: %016lx DR2: %016lx\n", d0
, d1
, d2
);
232 printk(KERN_INFO
"DR3: %016lx DR6: %016lx DR7: %016lx\n", d3
, d6
, d7
);
235 void show_regs(struct pt_regs
*regs
)
237 printk(KERN_INFO
"CPU %d:", smp_processor_id());
238 __show_regs(regs
, 1);
239 show_trace(NULL
, regs
, (void *)(regs
+ 1), regs
->bp
);
242 void release_thread(struct task_struct
*dead_task
)
245 if (dead_task
->mm
->context
.size
) {
246 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
248 dead_task
->mm
->context
.ldt
,
249 dead_task
->mm
->context
.size
);
255 static inline void set_32bit_tls(struct task_struct
*t
, int tls
, u32 addr
)
257 struct user_desc ud
= {
264 struct desc_struct
*desc
= t
->thread
.tls_array
;
269 static inline u32
read_32bit_tls(struct task_struct
*t
, int tls
)
271 return get_desc_base(&t
->thread
.tls_array
[tls
]);
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	/* Flush lazy FPU state to the task struct so the copy is coherent. */
	unlazy_fpu(tsk);
}
283 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long sp
,
284 unsigned long unused
,
285 struct task_struct
*p
, struct pt_regs
*regs
)
288 struct pt_regs
*childregs
;
289 struct task_struct
*me
= current
;
291 childregs
= ((struct pt_regs
*)
292 (THREAD_SIZE
+ task_stack_page(p
))) - 1;
298 childregs
->sp
= (unsigned long)childregs
;
300 p
->thread
.sp
= (unsigned long) childregs
;
301 p
->thread
.sp0
= (unsigned long) (childregs
+1);
302 p
->thread
.usersp
= me
->thread
.usersp
;
304 set_tsk_thread_flag(p
, TIF_FORK
);
306 p
->thread
.fs
= me
->thread
.fs
;
307 p
->thread
.gs
= me
->thread
.gs
;
309 savesegment(gs
, p
->thread
.gsindex
);
310 savesegment(fs
, p
->thread
.fsindex
);
311 savesegment(es
, p
->thread
.es
);
312 savesegment(ds
, p
->thread
.ds
);
314 if (unlikely(test_tsk_thread_flag(me
, TIF_IO_BITMAP
))) {
315 p
->thread
.io_bitmap_ptr
= kmalloc(IO_BITMAP_BYTES
, GFP_KERNEL
);
316 if (!p
->thread
.io_bitmap_ptr
) {
317 p
->thread
.io_bitmap_max
= 0;
320 memcpy(p
->thread
.io_bitmap_ptr
, me
->thread
.io_bitmap_ptr
,
322 set_tsk_thread_flag(p
, TIF_IO_BITMAP
);
326 * Set a new TLS for the child thread?
328 if (clone_flags
& CLONE_SETTLS
) {
329 #ifdef CONFIG_IA32_EMULATION
330 if (test_thread_flag(TIF_IA32
))
331 err
= do_set_thread_area(p
, -1,
332 (struct user_desc __user
*)childregs
->si
, 0);
335 err
= do_arch_prctl(p
, ARCH_SET_FS
, childregs
->r8
);
340 ds_copy_thread(p
, me
);
342 clear_tsk_thread_flag(p
, TIF_DEBUGCTLMSR
);
343 p
->thread
.debugctlmsr
= 0;
347 if (err
&& p
->thread
.io_bitmap_ptr
) {
348 kfree(p
->thread
.io_bitmap_ptr
);
349 p
->thread
.io_bitmap_max
= 0;
355 start_thread(struct pt_regs
*regs
, unsigned long new_ip
, unsigned long new_sp
)
363 percpu_write(old_rsp
, new_sp
);
364 regs
->cs
= __USER_CS
;
365 regs
->ss
= __USER_DS
;
369 * Free the old FP and other extended state
371 free_thread_xstate(current
);
373 EXPORT_SYMBOL_GPL(start_thread
);
376 * switch_to(x,y) should switch tasks from x to y.
378 * This could still be optimized:
379 * - fold all the options into a flag word and test it with a single test.
380 * - could test fs/gs bitsliced
382 * Kprobes not supported here. Set the probe on schedule instead.
383 * Function graph tracer not supported too.
385 __notrace_funcgraph
struct task_struct
*
386 __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
388 struct thread_struct
*prev
= &prev_p
->thread
;
389 struct thread_struct
*next
= &next_p
->thread
;
390 int cpu
= smp_processor_id();
391 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
392 unsigned fsindex
, gsindex
;
394 /* we're going to use this soon, after a few expensive things */
395 if (next_p
->fpu_counter
> 5)
396 prefetch(next
->xstate
);
399 * Reload esp0, LDT and the page table pointer:
405 * This won't pick up thread selector changes, but I guess that is ok.
407 savesegment(es
, prev
->es
);
408 if (unlikely(next
->es
| prev
->es
))
409 loadsegment(es
, next
->es
);
411 savesegment(ds
, prev
->ds
);
412 if (unlikely(next
->ds
| prev
->ds
))
413 loadsegment(ds
, next
->ds
);
416 /* We must save %fs and %gs before load_TLS() because
417 * %fs and %gs may be cleared by load_TLS().
419 * (e.g. xen_load_tls())
421 savesegment(fs
, fsindex
);
422 savesegment(gs
, gsindex
);
427 * Leave lazy mode, flushing any hypercalls made here.
428 * This must be done before restoring TLS segments so
429 * the GDT and LDT are properly updated, and must be
430 * done before math_state_restore, so the TS bit is up
433 arch_leave_lazy_cpu_mode();
438 * Segment register != 0 always requires a reload. Also
439 * reload when it has changed. When prev process used 64bit
440 * base always reload to avoid an information leak.
442 if (unlikely(fsindex
| next
->fsindex
| prev
->fs
)) {
443 loadsegment(fs
, next
->fsindex
);
445 * Check if the user used a selector != 0; if yes
446 * clear 64bit base, since overloaded base is always
447 * mapped to the Null selector
452 /* when next process has a 64bit base use it */
454 wrmsrl(MSR_FS_BASE
, next
->fs
);
455 prev
->fsindex
= fsindex
;
457 if (unlikely(gsindex
| next
->gsindex
| prev
->gs
)) {
458 load_gs_index(next
->gsindex
);
463 wrmsrl(MSR_KERNEL_GS_BASE
, next
->gs
);
464 prev
->gsindex
= gsindex
;
466 /* Must be after DS reload */
470 * Switch the PDA and FPU contexts.
472 prev
->usersp
= percpu_read(old_rsp
);
473 percpu_write(old_rsp
, next
->usersp
);
474 percpu_write(current_task
, next_p
);
476 percpu_write(kernel_stack
,
477 (unsigned long)task_stack_page(next_p
) +
478 THREAD_SIZE
- KERNEL_STACK_OFFSET
);
481 * Now maybe reload the debug registers and handle I/O bitmaps
483 if (unlikely(task_thread_info(next_p
)->flags
& _TIF_WORK_CTXSW_NEXT
||
484 task_thread_info(prev_p
)->flags
& _TIF_WORK_CTXSW_PREV
))
485 __switch_to_xtra(prev_p
, next_p
, tss
);
487 /* If the task has used fpu the last 5 timeslices, just do a full
488 * restore of the math state immediately to avoid the trap; the
489 * chances of needing FPU soon are obviously high now
491 * tsk_used_math() checks prevent calling math_state_restore(),
492 * which can sleep in the case of !tsk_used_math()
494 if (tsk_used_math(next_p
) && next_p
->fpu_counter
> 5)
495 math_state_restore();
500 * sys_execve() executes a new program.
503 long sys_execve(char __user
*name
, char __user
* __user
*argv
,
504 char __user
* __user
*envp
, struct pt_regs
*regs
)
509 filename
= getname(name
);
510 error
= PTR_ERR(filename
);
511 if (IS_ERR(filename
))
513 error
= do_execve(filename
, argv
, envp
, regs
);
518 void set_personality_64bit(void)
520 /* inherit personality from parent */
522 /* Make sure to be in 64bit mode */
523 clear_thread_flag(TIF_IA32
);
525 /* TBD: overwrites user setup. Should have two bits.
526 But 64bit processes have always behaved this way,
527 so it's not too bad. The main problem is just that
528 32bit childs are affected again. */
529 current
->personality
&= ~READ_IMPLIES_EXEC
;
533 sys_clone(unsigned long clone_flags
, unsigned long newsp
,
534 void __user
*parent_tid
, void __user
*child_tid
, struct pt_regs
*regs
)
538 return do_fork(clone_flags
, newsp
, regs
, 0, parent_tid
, child_tid
);
541 unsigned long get_wchan(struct task_struct
*p
)
547 if (!p
|| p
== current
|| p
->state
== TASK_RUNNING
)
549 stack
= (unsigned long)task_stack_page(p
);
550 if (p
->thread
.sp
< stack
|| p
->thread
.sp
>= stack
+THREAD_SIZE
)
552 fp
= *(u64
*)(p
->thread
.sp
);
554 if (fp
< (unsigned long)stack
||
555 fp
>= (unsigned long)stack
+THREAD_SIZE
)
558 if (!in_sched_functions(ip
))
561 } while (count
++ < 16);
565 long do_arch_prctl(struct task_struct
*task
, int code
, unsigned long addr
)
568 int doit
= task
== current
;
573 if (addr
>= TASK_SIZE_OF(task
))
576 /* handle small bases via the GDT because that's faster to
578 if (addr
<= 0xffffffff) {
579 set_32bit_tls(task
, GS_TLS
, addr
);
581 load_TLS(&task
->thread
, cpu
);
582 load_gs_index(GS_TLS_SEL
);
584 task
->thread
.gsindex
= GS_TLS_SEL
;
587 task
->thread
.gsindex
= 0;
588 task
->thread
.gs
= addr
;
591 ret
= checking_wrmsrl(MSR_KERNEL_GS_BASE
, addr
);
597 /* Not strictly needed for fs, but do it for symmetry
599 if (addr
>= TASK_SIZE_OF(task
))
602 /* handle small bases via the GDT because that's faster to
604 if (addr
<= 0xffffffff) {
605 set_32bit_tls(task
, FS_TLS
, addr
);
607 load_TLS(&task
->thread
, cpu
);
608 loadsegment(fs
, FS_TLS_SEL
);
610 task
->thread
.fsindex
= FS_TLS_SEL
;
613 task
->thread
.fsindex
= 0;
614 task
->thread
.fs
= addr
;
616 /* set the selector to 0 to not confuse
619 ret
= checking_wrmsrl(MSR_FS_BASE
, addr
);
626 if (task
->thread
.fsindex
== FS_TLS_SEL
)
627 base
= read_32bit_tls(task
, FS_TLS
);
629 rdmsrl(MSR_FS_BASE
, base
);
631 base
= task
->thread
.fs
;
632 ret
= put_user(base
, (unsigned long __user
*)addr
);
638 if (task
->thread
.gsindex
== GS_TLS_SEL
)
639 base
= read_32bit_tls(task
, GS_TLS
);
641 savesegment(gs
, gsindex
);
643 rdmsrl(MSR_KERNEL_GS_BASE
, base
);
645 base
= task
->thread
.gs
;
647 base
= task
->thread
.gs
;
648 ret
= put_user(base
, (unsigned long __user
*)addr
);
660 long sys_arch_prctl(int code
, unsigned long addr
)
662 return do_arch_prctl(current
, code
, addr
);
665 unsigned long arch_align_stack(unsigned long sp
)
667 if (!(current
->personality
& ADDR_NO_RANDOMIZE
) && randomize_va_space
)
668 sp
-= get_random_int() % 8192;
672 unsigned long arch_randomize_brk(struct mm_struct
*mm
)
674 unsigned long range_end
= mm
->brk
+ 0x02000000;
675 return randomize_range(mm
->brk
, range_end
, 0) ? : mm
->brk
;