/*
 * arch/x86/kernel/process_64.c
 *
 * Copyright (C) 1995  Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 *
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

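/*
 * Idle notifiers let other code get a callback when this CPU enters
 * (IDLE_START) or leaves (IDLE_END) the idle loop.
 */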
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

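/* show_regs() adds a stack backtrace on top of the register dump above. */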
void show_regs(struct pt_regs *regs)
{
	show_registers(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

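/*
 * Helpers for the per-thread TLS slots in the GDT. do_arch_prctl() uses
 * them for FS/GS bases that fit in 32 bits, since reloading a selector
 * is faster on context switch than rewriting the base MSR.
 */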
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr	= addr,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

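/*
 * Set up the child's kernel stack, pt_regs, segment state and (if the
 * parent has one) I/O permission bitmap. A non-zero return aborts the
 * fork/clone.
 */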
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

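/*
 * Reset the register and segment state for a freshly exec'ed task so it
 * starts at new_ip with stack new_sp in 64-bit user mode; the data segment
 * selectors are cleared and EFLAGS is reset to just IF (0x200).
 */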
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

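/*
 * Mark the current task as a native 64-bit process: clear TIF_IA32 and
 * drop READ_IMPLIES_EXEC from its personality.
 */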
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: overwrites user setup. Should have two bits.
	 * But 64-bit processes have always behaved this way,
	 * so it's not too bad. The main problem is just that
	 * 32-bit children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

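/*
 * get_wchan() walks the saved frame-pointer chain of a sleeping task and
 * returns the first return address outside the scheduler, i.e. the
 * function the task is blocked in. It gives up after 16 frames or if a
 * frame pointer points outside the task's stack.
 */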
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

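/*
 * Back end of the arch_prctl() syscall: get or set the FS/GS base of
 * @task. Bases that fit in 32 bits go through a GDT TLS slot; larger
 * bases are written straight to the FS_BASE/KERNEL_GS_BASE MSRs.
 */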
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

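/*
 * User stack pointer for this task: taken from pt_regs for IA32 tasks,
 * from the saved usersp for 64-bit tasks.
 */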
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}