Merge with Linux 2.5.59.
[linux-2.6/linux-mips.git] / arch / x86_64 / kernel / process.c
blob57a527b71d90406a165c3725fe8ca130394866da
/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 *
 *  This file handles the architecture-dependent parts of process handling..
 */
19 #define __KERNEL_SYSCALLS__
20 #include <stdarg.h>
22 #include <linux/compiler.h>
23 #include <linux/errno.h>
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
26 #include <linux/mm.h>
27 #include <linux/elfcore.h>
28 #include <linux/smp.h>
29 #include <linux/smp_lock.h>
30 #include <linux/stddef.h>
31 #include <linux/unistd.h>
32 #include <linux/ptrace.h>
33 #include <linux/slab.h>
34 #include <linux/vmalloc.h>
35 #include <linux/user.h>
36 #include <linux/module.h>
37 #include <linux/a.out.h>
38 #include <linux/interrupt.h>
39 #include <linux/config.h>
40 #include <linux/delay.h>
41 #include <linux/init.h>
42 #include <linux/ctype.h>
43 #include <linux/slab.h>
45 #include <asm/uaccess.h>
46 #include <asm/pgtable.h>
47 #include <asm/system.h>
48 #include <asm/io.h>
49 #include <asm/ldt.h>
50 #include <asm/processor.h>
51 #include <asm/i387.h>
52 #include <asm/desc.h>
53 #include <asm/mmu_context.h>
54 #include <asm/pda.h>
55 #include <asm/prctl.h>
56 #include <asm/kdebug.h>
58 #include <linux/irq.h>
/* Common entry point new tasks return through after a fork (entry.S). */
asmlinkage extern void ret_from_fork(void);

int sys_arch_prctl(int code, unsigned long addr);

/* Default clone flags used when creating kernel threads. */
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

/* While non-zero, the idle loop must not execute HLT (see disable_hlt()). */
int hlt_counter;

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
72 void disable_hlt(void)
74 hlt_counter++;
77 void enable_hlt(void)
79 hlt_counter--;
83 * We use this if we don't have any better
84 * idle routine..
86 void default_idle(void)
88 if (!hlt_counter) {
89 local_irq_disable();
90 if (!need_resched())
91 safe_halt();
92 else
93 local_irq_enable();
98 * On SMP it's slightly faster (but much more power-consuming!)
99 * to poll the ->need_resched flag instead of waiting for the
100 * cross-CPU IPI to arrive. Use this option with caution.
102 static void poll_idle (void)
104 int oldval;
106 local_irq_enable();
109 * Deal with another CPU just having chosen a thread to
110 * run here:
112 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
114 if (!oldval) {
115 set_thread_flag(TIF_POLLING_NRFLAG);
116 asm volatile(
117 "2:"
118 "testl %0,%1;"
119 "rep; nop;"
120 "je 2b;"
122 "i" (_TIF_NEED_RESCHED),
123 "m" (current_thread_info()->flags));
124 } else {
125 set_need_resched();
130 * The idle thread. There's no useful work to be
131 * done, so just try to conserve power and have a
132 * low exit latency (ie sit in a loop waiting for
133 * somebody to say that they'd like to reschedule)
135 void cpu_idle (void)
137 /* endless idle loop with no priority at all */
138 while (1) {
139 void (*idle)(void) = pm_idle;
140 if (!idle)
141 idle = default_idle;
142 while (!need_resched())
143 idle();
144 schedule();
148 static int __init idle_setup (char *str)
150 if (!strncmp(str, "poll", 4)) {
151 printk("using polling idle threads.\n");
152 pm_idle = poll_idle;
155 return 1;
158 __setup("idle=", idle_setup);
/* Prints also some state that isn't saved in the pt_regs */
void show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex,gsindex;
	unsigned int ds,cs,es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s\n", current->pid, current->comm, print_tainted());
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* ds/es/fs/gs selectors are not in pt_regs; read the live
	   registers of the current CPU instead. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* On x86-64 the fs/gs segment bases live in MSRs. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs,fsindex,gs,gsindex,shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
206 extern void load_gs_index(unsigned);
209 * Free current thread data structures etc..
211 void exit_thread(void)
213 struct task_struct *me = current;
214 if (me->thread.io_bitmap_ptr) {
215 kfree(me->thread.io_bitmap_ptr);
216 me->thread.io_bitmap_ptr = NULL;
217 (init_tss + smp_processor_id())->io_map_base =
218 INVALID_IO_BITMAP_OFFSET;
222 void flush_thread(void)
224 struct task_struct *tsk = current;
226 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
227 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
229 * Forget coprocessor state..
231 clear_fpu(tsk);
232 tsk->used_math = 0;
235 void release_thread(struct task_struct *dead_task)
237 if (dead_task->mm) {
238 if (dead_task->mm->context.size) {
239 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
240 dead_task->comm,
241 dead_task->mm->context.ldt,
242 dead_task->mm->context.size);
243 BUG();
/*
 * Set up the kernel stack and thread state of a newly forked task.
 *
 * rsp is the child's user stack pointer; rsp == ~0 makes the child run
 * on its own register frame (presumably the kernel-thread case — TODO
 * confirm against kernel_thread()).  regs is the parent's register
 * frame, which the child inherits with rax forced to 0 so fork()
 * returns 0 in the child.
 *
 * Returns 0 on success or a negative errno.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;
	struct task_struct *me = current;

	/* Place the child's register frame at the top of its kernel stack. */
	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;

	*childregs = *regs;

	childregs->rax = 0;	/* child sees 0 from fork() */
	childregs->rsp = rsp;
	if (rsp == ~0) {
		childregs->rsp = (unsigned long)childregs;
	}
	p->set_child_tid = p->clear_child_tid = NULL;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = current->thread.userrsp;

	/* First schedule of the child resumes in ret_from_fork. */
	p->thread.rip = (unsigned long) ret_from_fork;

	/* Inherit the 64bit fs/gs bases and current segment selectors. */
	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
	asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
	asm("movl %%es,%0" : "=m" (p->thread.es));
	asm("movl %%ds,%0" : "=m" (p->thread.ds));

	/* Flush lazy FPU state to memory before copying it to the child. */
	unlazy_fpu(current);
	p->thread.i387 = current->thread.i387;

	if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
		/* NOTE(review): if a later CLONE_SETTLS error path returns,
		   this allocation is not freed here — verify the caller's
		   cleanup (exit_thread) releases it. */
		p->thread.io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr)
			return -ENOMEM;
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
		       (IO_BITMAP_SIZE+1)*4);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
		struct n_desc_struct *desc;
		struct user_desc info;
		int idx;

		/* 32bit tasks pass the user_desc pointer in rsi (ia32 clone
		   ABI), 64bit tasks in rdx. */
		if (copy_from_user(&info, test_thread_flag(TIF_IA32) ?
				(void *)childregs->rsi :
				(void *)childregs->rdx, sizeof(info)))
			return -EFAULT;
		if (LDT_empty(&info))
			return -EINVAL;

		idx = info.entry_number;
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return -EINVAL;

		desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}

	return 0;
}
/*
 * This special macro can be used to load a debugging register
 * (thread = source thread_struct, register = debug register number).
 */
#define loaddebug(thread,register) \
	set_debug(thread->debugreg[register], register)
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This is the slow-path C part of the context switch: it hands over
 * the FPU, kernel stack pointer, segments (ds/es/fs/gs plus their
 * 64bit MSR bases), TLS, the PDA, debug registers and the TSS io
 * bitmap from prev_p to next_p.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 */
void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = init_tss + cpu;

	/* Save lazily-held FPU state before we stop running prev_p. */
	unlazy_fpu(prev_p);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("movl %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=g" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely((fsindex | next->fsindex) || prev->fs))
			loadsegment(fs, next->fsindex);
		/* check if the user changed the selector
		   if yes clear 64bit base. */
		if (unlikely(fsindex != prev->fsindex))
			prev->fs = 0;
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		/* Same dance for gs (uses the MSR-based load_gs_index). */
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=g" (gsindex));
		if (unlikely((gsindex | next->gsindex) || prev->gs))
			load_gs_index(next->gsindex);
		if (unlikely(gsindex != prev->gsindex))
			prev->gs = 0;
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/*
	 * Switch the PDA context.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);
	write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);

	/*
	 * Now maybe reload the debug registers
	 */
	if (unlikely(next->debugreg[7])) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	/*
	 * Handle the IO bitmap
	 */
	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
		if (next->io_bitmap_ptr) {
			/*
			 * 4 cachelines copy ... not good, but not that
			 * bad either. Anyone got something better?
			 * This only affects processes which use ioperm().
			 */
			memcpy(tss->io_bitmap, next->io_bitmap_ptr,
			       IO_BITMAP_SIZE*sizeof(u32));
			tss->io_map_base = IO_BITMAP_OFFSET;
		} else {
			/*
			 * a bitmap offset pointing outside of the TSS limit
			 * causes a nicely controllable SIGSEGV if a process
			 * tries to use a port IO instruction. The first
			 * sys_ioperm() call sets up the bitmap properly.
			 */
			tss->io_map_base = INVALID_IO_BITMAP_OFFSET;
		}
	}
}
441 * sys_execve() executes a new program.
443 asmlinkage
444 long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
446 long error;
447 char * filename;
449 filename = getname(name);
450 error = PTR_ERR(filename);
451 if (IS_ERR(filename))
452 return error;
453 error = do_execve(filename, argv, envp, &regs);
454 if (error == 0)
455 current->ptrace &= ~PT_DTRACE;
456 putname(filename);
457 return error;
460 void set_personality_64bit(void)
462 /* inherit personality from parent */
464 /* Make sure to be in 64bit mode */
465 clear_thread_flag(TIF_IA32);
468 asmlinkage long sys_fork(struct pt_regs regs)
470 struct task_struct *p;
471 p = do_fork(SIGCHLD, regs.rsp, &regs, 0, NULL, NULL);
472 return IS_ERR(p) ? PTR_ERR(p) : p->pid;
475 asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void *parent_tid, void *child_tid, struct pt_regs regs)
477 struct task_struct *p;
478 if (!newsp)
479 newsp = regs.rsp;
480 p = do_fork(clone_flags & ~CLONE_IDLETASK, newsp, &regs, 0,
481 parent_tid, child_tid);
482 return IS_ERR(p) ? PTR_ERR(p) : p->pid;
486 * This is trivial, and on the face of it looks like it
487 * could equally well be done in user mode.
489 * Not so, for quite unobvious reasons - register pressure.
490 * In user mode vfork() cannot have a stack frame, and if
491 * done by calling the "clone()" system call directly, you
492 * do not have enough call-clobbered registers to hold all
493 * the information you need.
495 asmlinkage long sys_vfork(struct pt_regs regs)
497 struct task_struct *p;
498 p = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, &regs, 0,
499 NULL, NULL);
500 return IS_ERR(p) ? PTR_ERR(p) : p->pid;
504 * These bracket the sleeping functions..
506 extern void scheduling_functions_start_here(void);
507 extern void scheduling_functions_end_here(void);
508 #define first_sched ((unsigned long) scheduling_functions_start_here)
509 #define last_sched ((unsigned long) scheduling_functions_end_here)
511 unsigned long get_wchan(struct task_struct *p)
513 u64 fp,rip;
514 int count = 0;
516 if (!p || p == current || p->state==TASK_RUNNING)
517 return 0;
518 if (p->thread.rsp < (u64)p || p->thread.rsp > (u64)p + THREAD_SIZE)
519 return 0;
520 fp = *(u64 *)(p->thread.rsp);
521 do {
522 if (fp < (unsigned long)p || fp > (unsigned long)p+THREAD_SIZE)
523 return 0;
524 rip = *(u64 *)(fp+8);
525 if (rip < first_sched || rip >= last_sched)
526 return rip;
527 fp = *(u64 *)fp;
528 } while (count++ < 16);
529 return 0;
531 #undef last_sched
532 #undef first_sched
534 int sys_arch_prctl(int code, unsigned long addr)
536 int ret = 0;
538 switch (code) {
539 case ARCH_SET_GS:
540 #if 1
541 /* For now. We still have one unsolved bug in long gs base context
542 switch handling. */
543 return -EINVAL;
544 #else
545 if (addr >= TASK_SIZE)
546 return -EPERM;
547 get_cpu();
548 load_gs_index(__USER_LONGBASE);
549 current->thread.gsindex = __USER_LONGBASE;
550 current->thread.gs = addr;
551 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
552 put_cpu();
553 break;
554 #endif
555 case ARCH_SET_FS:
556 /* Not strictly needed for fs, but do it for symmetry
557 with gs */
558 if (addr >= TASK_SIZE)
559 return -EPERM;
560 get_cpu();
561 asm volatile("movl %0,%%fs" :: "r" (__USER_LONGBASE));
562 current->thread.fsindex = __USER_LONGBASE;
563 current->thread.fs = addr;
564 ret = checking_wrmsrl(MSR_FS_BASE, addr);
565 put_cpu();
566 break;
568 /* Returned value may not be correct when the user changed fs/gs */
569 case ARCH_GET_FS:
570 ret = put_user(current->thread.fs, (unsigned long *)addr);
571 break;
573 case ARCH_GET_GS:
574 ret = put_user(current->thread.gs, (unsigned long *)addr);
575 break;
577 default:
578 ret = -EINVAL;
579 break;
582 return ret;
586 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
588 static int get_free_idx(void)
590 struct thread_struct *t = &current->thread;
591 int idx;
593 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
594 if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
595 return idx + GDT_ENTRY_TLS_MIN;
596 return -ESRCH;
600 * Set a given TLS descriptor:
601 * When you want addresses > 32bit use arch_prctl()
603 int do_set_thread_area(struct thread_struct *t, struct user_desc *u_info)
605 struct user_desc info;
606 struct n_desc_struct *desc;
607 int cpu, idx;
609 if (copy_from_user(&info, u_info, sizeof(info)))
610 return -EFAULT;
612 idx = info.entry_number;
615 * index -1 means the kernel should try to find and
616 * allocate an empty descriptor:
618 if (idx == -1) {
619 idx = get_free_idx();
620 if (idx < 0)
621 return idx;
622 if (put_user(idx, &u_info->entry_number))
623 return -EFAULT;
626 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
627 return -EINVAL;
629 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
632 * We must not get preempted while modifying the TLS.
634 cpu = get_cpu();
636 if (LDT_empty(&info)) {
637 desc->a = 0;
638 desc->b = 0;
639 } else {
640 desc->a = LDT_entry_a(&info);
641 desc->b = LDT_entry_b(&info);
643 if (t == &current->thread)
644 load_TLS(t, cpu);
646 put_cpu();
647 return 0;
/* Syscall wrapper: set a TLS descriptor for the current task. */
asmlinkage int sys_set_thread_area(struct user_desc *u_info)
{
	return do_set_thread_area(&current->thread, u_info);
}
/*
 * Get the current Thread-Local Storage area:
 */

/*
 * Field extractors for a GDT descriptor split into two 32bit words
 * (a = low word, b = high word).  Bit positions follow the hardware
 * segment descriptor layout.
 */
#define GET_BASE(desc) ( \
	(((desc)->a >> 16) & 0x0000ffff) | \
	(((desc)->b << 16) & 0x00ff0000) | \
	( (desc)->b & 0xff000000)   )

#define GET_LIMIT(desc) ( \
	((desc)->a & 0x0ffff) | \
	 ((desc)->b & 0xf0000) )

/* D/B (32bit default size) is bit 22 of the high word; bit 23 is the
   granularity bit, extracted by GET_LIMIT_PAGES below.  The previous
   definition read bit 23 here, making seg_32bit report the G bit. */
#define GET_32BIT(desc)		(((desc)->b >> 22) & 1)
#define GET_CONTENTS(desc)	(((desc)->b >> 10) & 3)
#define GET_WRITABLE(desc)	(((desc)->b >>  9) & 1)
#define GET_LIMIT_PAGES(desc)	(((desc)->b >> 23) & 1)
#define GET_PRESENT(desc)	(((desc)->b >> 15) & 1)
#define GET_USEABLE(desc)	(((desc)->b >> 20) & 1)
#define GET_LONGMODE(desc)	(((desc)->b >> 21) & 1)
677 int do_get_thread_area(struct thread_struct *t, struct user_desc *u_info)
679 struct user_desc info;
680 struct n_desc_struct *desc;
681 int idx;
683 if (get_user(idx, &u_info->entry_number))
684 return -EFAULT;
685 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
686 return -EINVAL;
688 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
690 memset(&info, 0, sizeof(struct user_desc));
691 info.entry_number = idx;
692 info.base_addr = GET_BASE(desc);
693 info.limit = GET_LIMIT(desc);
694 info.seg_32bit = GET_32BIT(desc);
695 info.contents = GET_CONTENTS(desc);
696 info.read_exec_only = !GET_WRITABLE(desc);
697 info.limit_in_pages = GET_LIMIT_PAGES(desc);
698 info.seg_not_present = !GET_PRESENT(desc);
699 info.useable = GET_USEABLE(desc);
700 info.lm = GET_LONGMODE(desc);
702 if (copy_to_user(u_info, &info, sizeof(info)))
703 return -EFAULT;
704 return 0;
/* Syscall wrapper: read a TLS descriptor of the current task. */
asmlinkage int sys_get_thread_area(struct user_desc *u_info)
{
	return do_get_thread_area(&current->thread, u_info);
}
713 * Capture the user space registers if the task is not running (in user space)
715 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
717 struct pt_regs *pp, ptregs;
719 pp = (struct pt_regs *)(tsk->thread.rsp0);
720 --pp;
722 ptregs = *pp;
723 ptregs.cs &= 0xffff;
724 ptregs.ss &= 0xffff;
726 elf_core_copy_regs(regs, &ptregs);
728 return 1;