added 2.6.29.6 aldebaran kernel
[nao-ulib.git] / kernel / 2.6.29.6-aldebaran-rt / arch / x86 / kernel / process_64.c
blob 2c33df9bde960d498757cf8d10194bc8a7a07509
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <stdarg.h>

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
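
/*
 * enter_idle()/__exit_idle() bracket the per-CPU idle state: they flip the
 * per-CPU is_idle flag and run the idle_notifier chain so that interested
 * subsystems see the IDLE_START/IDLE_END transitions.
 */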
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		local_irq_disable();
		__preempt_enable_no_resched();
		__schedule();
		preempt_disable();
		local_irq_enable();
	}
}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
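
/*
 * Install a 32-bit TLS descriptor (4 GB limit, page granular, 32-bit code
 * and data) with the given base into slot 'tls' of the task's tls_array.
 */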
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
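
/*
 * copy_thread() lays out the child's kernel stack: the child's pt_regs sit
 * at the top of its stack page, %rax is zeroed so the child sees a 0 return
 * from fork/clone, and for kernel threads (signalled here by sp == ~0UL)
 * the stack pointer is parked on the pt_regs themselves.
 */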
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	ds_copy_thread(p, me);

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
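
/*
 * Called when a new user-mode image is started (e.g. from the exec path):
 * clear the data segment registers and the 64-bit FS/GS bases, point the
 * registers at the new entry point and stack, and drop the old FPU and
 * extended state.
 */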
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
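
/*
 * get_wchan() walks the sleeping task's saved frame pointers (at most 16
 * frames, staying within the task's kernel stack) and returns the first
 * return address that is not inside the scheduler itself, i.e. the place
 * where the task is waiting.
 */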
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
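
/*
 * do_arch_prctl() implements ARCH_SET_FS/ARCH_SET_GS and their getters.
 * Bases that fit in 32 bits are installed as TLS entries in the GDT (cheaper
 * to switch); larger bases are written to the FS/GS base MSRs with a zero
 * selector so __switch_to handles them via the MSR path.
 */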
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
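
/*
 * Randomize the user stack top downward by up to 8 kB (unless the
 * personality or the randomize_va_space sysctl disables VA randomization)
 * and keep it 16-byte aligned.
 */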
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
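
/*
 * Pick a randomized brk (heap) start within 32 MB above the current brk;
 * fall back to the unrandomized brk if no range can be chosen.
 */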
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}