/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#define __KERNEL_SYSCALLS__

#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/config.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/ctype.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>

#include <linux/irq.h>

asmlinkage extern void ret_from_fork(void);

int sys_arch_prctl(int code, unsigned long addr);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);

int hlt_counter;

void disable_hlt(void)
{
	hlt_counter++;
}

/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	if (!hlt_counter) {
		local_irq_disable();
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	int oldval;

	local_irq_enable();

	/*
	 * Deal with another CPU just having chosen a thread to
	 * run here:
	 */
	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

	if (!oldval) {
		set_thread_flag(TIF_POLLING_NRFLAG);
		asm volatile(
			"2:"
			"testl %0,%1;"
			"rep; nop;"
			"je 2b;"
			: :
			"i" (_TIF_NEED_RESCHED),
			"m" (current_thread_info()->flags));
	} else {
		set_need_resched();
	}
}

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	/* endless idle loop with no priority at all */
	while (1) {
		void (*idle)(void) = pm_idle;
		if (!idle)
			idle = default_idle;
		while (!need_resched())
			idle();
		schedule();
	}
}

static int __init idle_setup(char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}
	return 1;
}

__setup("idle=", idle_setup);

/* Prints also some state that isn't saved in the pt_regs */
void show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("Pid: %d, comm: %.20s %s\n", current->pid, current->comm, print_tainted());
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}

extern void load_gs_index(unsigned);

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;

	if (me->thread.io_bitmap_ptr) {
		kfree(me->thread.io_bitmap_ptr);
		me->thread.io_bitmap_ptr = NULL;
		(init_tss + smp_processor_id())->io_map_base =
			INVALID_IO_BITMAP_OFFSET;
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	tsk->used_math = 0;
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;
	struct task_struct *me = current;

	/* the child's pt_regs live at the top of its kernel stack */
	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	/* a stack pointer of ~0 means: keep the child on its kernel stack */
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->set_child_tid = p->clear_child_tid = NULL;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = current->thread.userrsp;

	p->thread.rip = (unsigned long) ret_from_fork;

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
	asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
	asm("movl %%es,%0" : "=m" (p->thread.es));
	asm("movl %%ds,%0" : "=m" (p->thread.ds));

	unlazy_fpu(current);
	p->thread.i387 = current->thread.i387;

	if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
		p->thread.io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr)
			return -ENOMEM;
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
		       (IO_BITMAP_SIZE+1)*4);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
		struct n_desc_struct *desc;
		struct user_desc info;
		int idx;

		/* ia32 and 64bit callers pass the user_desc pointer in
		   different registers */
		if (copy_from_user(&info, test_thread_flag(TIF_IA32) ?
				   (void *)childregs->rsi :
				   (void *)childregs->rdx, sizeof(info)))
			return -EFAULT;
		if (LDT_empty(&info))
			return -EINVAL;

		idx = info.entry_number;
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return -EINVAL;

		desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}

	return 0;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,register) \
		set_debug(thread->debugreg[register], register)
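
/*
 * For example, loaddebug(next, 7) expands to
 * set_debug(next->debugreg[7], 7), i.e. it writes the thread's saved
 * shadow value back into the hardware register %db7.
 */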

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 */
void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = init_tss + cpu;

	unlazy_fpu(prev_p);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("movl %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("movl %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;

		asm volatile("movl %%fs,%0" : "=g" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely((fsindex | next->fsindex) || prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user changed the selector
			   if yes clear 64bit base. */
			if (unlikely(fsindex != prev->fsindex))
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;

		asm volatile("movl %%gs,%0" : "=g" (gsindex));
		if (unlikely((gsindex | next->gsindex) || prev->gs)) {
			load_gs_index(next->gsindex);
			if (unlikely(gsindex != prev->gsindex))
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/*
	 * Switch the PDA context.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);
	write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);

	/*
	 * Now maybe reload the debug registers
	 */
	if (unlikely(next->debugreg[7])) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	/*
	 * Handle the IO bitmap
	 */
	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
		if (next->io_bitmap_ptr) {
			/*
			 * 4 cachelines copy ... not good, but not that
			 * bad either. Anyone got something better?
			 * This only affects processes which use ioperm().
			 */
			memcpy(tss->io_bitmap, next->io_bitmap_ptr,
			       IO_BITMAP_SIZE*sizeof(u32));
			tss->io_map_base = IO_BITMAP_OFFSET;
		} else {
			/*
			 * a bitmap offset pointing outside of the TSS limit
			 * causes a nicely controllable SIGSEGV if a process
			 * tries to use a port IO instruction. The first
			 * sys_ioperm() call sets up the bitmap properly.
			 */
			tss->io_map_base = INVALID_IO_BITMAP_OFFSET;
		}
	}
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char *name, char **argv, char **envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0)
		current->ptrace &= ~PT_DTRACE;
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
}

asmlinkage long sys_fork(struct pt_regs regs)
{
	struct task_struct *p;

	p = do_fork(SIGCHLD, regs.rsp, &regs, 0, NULL, NULL);
	return IS_ERR(p) ? PTR_ERR(p) : p->pid;
}

asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp,
			  void *parent_tid, void *child_tid, struct pt_regs regs)
{
	struct task_struct *p;

	if (!newsp)
		newsp = regs.rsp;
	p = do_fork(clone_flags & ~CLONE_IDLETASK, newsp, &regs, 0,
		    parent_tid, child_tid);
	return IS_ERR(p) ? PTR_ERR(p) : p->pid;
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs regs)
{
	struct task_struct *p;

	p = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, &regs, 0,
		    NULL, NULL);
	return IS_ERR(p) ? PTR_ERR(p) : p->pid;
}

/*
 * These bracket the sleeping functions..
 */
extern void scheduling_functions_start_here(void);
extern void scheduling_functions_end_here(void);
#define first_sched	((unsigned long) scheduling_functions_start_here)
#define last_sched	((unsigned long) scheduling_functions_end_here)

unsigned long get_wchan(struct task_struct *p)
{
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	if (p->thread.rsp < (u64)p || p->thread.rsp > (u64)p + THREAD_SIZE)
		return 0;
	/* the saved frame pointer sits at the top of the sleeping
	   task's kernel stack; the return address lives at fp+8 */
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)p || fp > (unsigned long)p+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (rip < first_sched || rip >= last_sched)
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
#undef last_sched
#undef first_sched

int sys_arch_prctl(int code, unsigned long addr)
{
	int ret = 0;

	switch (code) {
	case ARCH_SET_GS:
		/* For now. We still have one unsolved bug in long gs base context
		   switch handling. */
		if (addr >= TASK_SIZE)
			return -EPERM;
		load_gs_index(__USER_LONGBASE);
		current->thread.gsindex = __USER_LONGBASE;
		current->thread.gs = addr;
		ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE)
			return -EPERM;
		asm volatile("movl %0,%%fs" :: "r" (__USER_LONGBASE));
		current->thread.fsindex = __USER_LONGBASE;
		current->thread.fs = addr;
		ret = checking_wrmsrl(MSR_FS_BASE, addr);
		break;

		/* Returned value may not be correct when the user changed fs/gs */
	case ARCH_GET_FS:
		ret = put_user(current->thread.fs, (unsigned long *)addr);
		break;
	case ARCH_GET_GS:
		ret = put_user(current->thread.gs, (unsigned long *)addr);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}
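
/*
 * Illustrative userspace usage (not part of this file; the syscall
 * number and ARCH_* codes follow asm/prctl.h, the wrapper call is an
 * assumption): a 64bit process can move its fs base with
 *
 *	syscall(__NR_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *	syscall(__NR_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *
 * where tls_block points to the new thread-local block and base
 * receives the value saved in current->thread.fs above.
 */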

/*
 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
 */
static int get_free_idx(void)
{
	struct thread_struct *t = &current->thread;
	int idx;

	for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
		if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
			return idx + GDT_ENTRY_TLS_MIN;
	return -ESRCH;
}

/*
 * Set a given TLS descriptor:
 * When you want addresses > 32bit use arch_prctl()
 */
int do_set_thread_area(struct thread_struct *t, struct user_desc *u_info)
{
	struct user_desc info;
	struct n_desc_struct *desc;
	int cpu, idx;

	if (copy_from_user(&info, u_info, sizeof(info)))
		return -EFAULT;

	idx = info.entry_number;

	/*
	 * index -1 means the kernel should try to find and
	 * allocate an empty descriptor:
	 */
	if (idx == -1) {
		idx = get_free_idx();
		if (idx < 0)
			return idx;
		if (put_user(idx, &u_info->entry_number))
			return -EFAULT;
	}

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;

	/*
	 * We must not get preempted while modifying the TLS.
	 */
	cpu = get_cpu();

	if (LDT_empty(&info)) {
		desc->a = 0;
		desc->b = 0;
	} else {
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}
	if (t == &current->thread)
		load_TLS(t, cpu);

	put_cpu();
	return 0;
}

asmlinkage int sys_set_thread_area(struct user_desc *u_info)
{
	return do_set_thread_area(&current->thread, u_info);
}
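
/*
 * Illustrative userspace usage (assumed, not part of this file): to
 * allocate a fresh TLS slot, a thread library fills a user_desc with
 * entry_number = -1 and lets the kernel pick the index:
 *
 *	struct user_desc ud = {
 *		.entry_number   = -1,		// let get_free_idx() choose
 *		.base_addr      = (unsigned long)tls,
 *		.limit          = 0xfffff,
 *		.seg_32bit      = 1,
 *		.limit_in_pages = 1,
 *		.useable        = 1,
 *	};
 *	set_thread_area(&ud);	// ud.entry_number now holds the slot
 */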

/*
 * Get the current Thread-Local Storage area:
 */

#define GET_BASE(desc) ( \
	(((desc)->a >> 16) & 0x0000ffff) | \
	(((desc)->b << 16) & 0x00ff0000) | \
	( (desc)->b        & 0xff000000)   )

#define GET_LIMIT(desc) ( \
	((desc)->a &  0x0ffff) | \
	 ((desc)->b & 0xf0000) )

#define GET_32BIT(desc)		(((desc)->b >> 22) & 1)
#define GET_CONTENTS(desc)	(((desc)->b >> 10) & 3)
#define GET_WRITABLE(desc)	(((desc)->b >>  9) & 1)
#define GET_LIMIT_PAGES(desc)	(((desc)->b >> 23) & 1)
#define GET_PRESENT(desc)	(((desc)->b >> 15) & 1)
#define GET_USEABLE(desc)	(((desc)->b >> 20) & 1)
#define GET_LONGMODE(desc)	(((desc)->b >> 21) & 1)
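
/*
 * Worked example (illustrative values, not from the source): for a
 * descriptor with a = 0x5678ffff and b = 0x12dff234, GET_BASE()
 * reassembles 0x5678 (a bits 16-31), 0x34 (b bits 0-7) and 0x12
 * (b bits 24-31) into base 0x12345678, and GET_LIMIT() combines
 * 0xffff (from a) with 0xf0000 (from b) into limit 0xfffff; the flag
 * extractors then read G (bit 23), D/B (bit 22), L (bit 21), AVL
 * (bit 20) and P (bit 15) out of b.
 */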

int do_get_thread_area(struct thread_struct *t, struct user_desc *u_info)
{
	struct user_desc info;
	struct n_desc_struct *desc;
	int idx;

	if (get_user(idx, &u_info->entry_number))
		return -EFAULT;
	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;

	memset(&info, 0, sizeof(struct user_desc));
	info.entry_number = idx;
	info.base_addr = GET_BASE(desc);
	info.limit = GET_LIMIT(desc);
	info.seg_32bit = GET_32BIT(desc);
	info.contents = GET_CONTENTS(desc);
	info.read_exec_only = !GET_WRITABLE(desc);
	info.limit_in_pages = GET_LIMIT_PAGES(desc);
	info.seg_not_present = !GET_PRESENT(desc);
	info.useable = GET_USEABLE(desc);
	info.lm = GET_LONGMODE(desc);

	if (copy_to_user(u_info, &info, sizeof(info)))
		return -EFAULT;
	return 0;
}

asmlinkage int sys_get_thread_area(struct user_desc *u_info)
{
	return do_get_thread_area(&current->thread, u_info);
}

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = (struct pt_regs *)(tsk->thread.rsp0);
	--pp;

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}