/*
 *  linux/arch/i386/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
#include <stdarg.h>

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/config.h>
#include <linux/version.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/mc146818rtc.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/desc.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif

#include <linux/irq.h>
#include <linux/err.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

int hlt_counter;
/*
 * Return saved PC of a blocked thread.
 */
unsigned long thread_saved_pc(struct task_struct *tsk)
{
	return ((unsigned long *)tsk->thread.esp)[3];
}
/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);

void disable_hlt(void)
{
	hlt_counter++;
}
EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
	hlt_counter--;
}
EXPORT_SYMBOL(enable_hlt);
/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	if (!hlt_counter && current_cpu_data.hlt_works_ok) {
		local_irq_disable();
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
}
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->work.need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	int oldval;

	local_irq_enable();

	/*
	 * Deal with another CPU just having chosen a thread to
	 * run here:
	 */
	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

	if (!oldval) {
		set_thread_flag(TIF_POLLING_NRFLAG);
		asm volatile(
			"2:"
			"testl %0, %1;"
			"rep; nop;"
			"je 2b;"
			: : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
		clear_thread_flag(TIF_POLLING_NRFLAG);
	} else {
		set_need_resched();
	}
}
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);
			/*
			 * Mark this as an RCU critical section so that
			 * synchronize_kernel() in the unload path waits
			 * for our completion.
			 */
			rcu_read_lock();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			irq_stat[smp_processor_id()].idle_timestamp = jiffies;
			idle();
			rcu_read_unlock();
		}
		schedule();
	}
}
/*
 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate the IPI used to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter an optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
	local_irq_enable();

	if (!need_resched()) {
		set_thread_flag(TIF_POLLING_NRFLAG);
		do {
			__monitor((void *)&current_thread_info()->flags, 0, 0);
			if (need_resched())
				break;
			__mwait(0, 0);
		} while (!need_resched());
		clear_thread_flag(TIF_POLLING_NRFLAG);
	}
}
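/*
 * For reference -- a sketch, not part of this file: __monitor() and
 * __mwait() used above are thin inline-asm wrappers around the raw
 * instruction encodings, defined in <asm/processor.h> roughly as:
 *
 *	static inline void __monitor(const void *eax, unsigned long ecx,
 *				     unsigned long edx)
 *	{
 *		// "monitor %eax,%ecx,%edx;" encoded by hand
 *		asm volatile(".byte 0x0f,0x01,0xc8;"
 *			     : : "a" (eax), "c" (ecx), "d" (edx));
 *	}
 *
 *	static inline void __mwait(unsigned long eax, unsigned long ecx)
 *	{
 *		// "mwait %eax,%ecx;" encoded by hand
 *		asm volatile(".byte 0x0f,0x01,0xc9;"
 *			     : : "a" (eax), "c" (ecx));
 *	}
 */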
void __init select_idle_routine(const struct cpuinfo_x86 *c)
{
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		printk("monitor/mwait feature present.\n");
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait.
		 */
		if (!pm_idle) {
			printk("using mwait in idle threads.\n");
			pm_idle = mwait_idle;
		}
	}
}
static int __init idle_setup(char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
#ifdef CONFIG_X86_SMP
		if (smp_num_siblings > 1)
			printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
#endif
	} else if (!strncmp(str, "halt", 4)) {
		printk("using halt in idle threads.\n");
		pm_idle = default_idle;
	}

	return 1;
}

__setup("idle=", idle_setup);
void show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;

	printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
	printk("EIP: %04x:[<%08lx>] CPU: %d\n", 0xffff & regs->xcs, regs->eip, smp_processor_id());
	print_symbol("EIP is at %s\n", regs->eip);

	if (regs->xcs & 3)
		printk(" ESP: %04x:%08lx", 0xffff & regs->xss, regs->esp);
	printk(" EFLAGS: %08lx    %s  (%s)\n", regs->eflags, print_tainted(), UTS_RELEASE);
	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
		regs->eax, regs->ebx, regs->ecx, regs->edx);
	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
		regs->esi, regs->edi, regs->ebp);
	printk(" DS: %04x ES: %04x\n",
		0xffff & regs->xds, 0xffff & regs->xes);

	__asm__("movl %%cr0, %0": "=r" (cr0));
	__asm__("movl %%cr2, %0": "=r" (cr2));
	__asm__("movl %%cr3, %0": "=r" (cr3));
	/* This could fault if %cr4 does not exist */
	__asm__("1: movl %%cr4, %0		\n"
		"2:				\n"
		".section __ex_table,\"a\"	\n"
		".long 1b,2b			\n"
		".previous			\n"
		: "=r" (cr4): "0" (0));
	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
	show_trace(NULL, &regs->esp);
}
/*
 * This gets run with %ebx containing the
 * function to call, and %edx containing
 * the "args".
 */
extern void kernel_thread_helper(void);
__asm__(".section .text\n"
	".align 4\n"
	"kernel_thread_helper:\n\t"
	"movl %edx,%eax\n\t"
	"pushl %edx\n\t"
	"call *%ebx\n\t"
	"pushl %eax\n\t"
	"call do_exit\n"
	".previous");
/*
 * Create a kernel thread
 */
int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	struct pt_regs regs;

	memset(&regs, 0, sizeof(regs));

	regs.ebx = (unsigned long) fn;
	regs.edx = (unsigned long) arg;

	regs.xds = __USER_DS;
	regs.xes = __USER_DS;
	regs.orig_eax = -1;
	regs.eip = (unsigned long) kernel_thread_helper;
	regs.xcs = __KERNEL_CS;
	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;

	/* Ok, create the new process.. */
	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
}
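/*
 * Illustrative usage (a hedged sketch, not code from this file): a driver
 * wanting a background thread passes its own function and argument; the
 * names below are hypothetical.
 *
 *	static int my_worker(void *data)
 *	{
 *		daemonize("my_worker");		// shed user-space context
 *		while (!signal_pending(current))
 *			schedule();
 *		return 0;
 *	}
 *
 *	kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES);
 */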
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_struct *t = &tsk->thread;

	/* The process may have allocated an io port bitmap... nuke it. */
	if (unlikely(NULL != t->io_bitmap_ptr)) {
		int cpu = get_cpu();
		struct tss_struct *tss = &per_cpu(init_tss, cpu);

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
		t->io_bitmap_max = 0;
		tss->io_bitmap_owner = NULL;
		tss->io_bitmap_max = 0;
		tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
		put_cpu();
	}
}
void flush_thread(void)
{
	struct task_struct *tsk = current;

	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		// temporary debugging check
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}

	release_x86_irqs(dead_task);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;
	struct task_struct *tsk;
	int err;

	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
	*childregs = *regs;
	childregs->eax = 0;
	childregs->esp = esp;
	p->set_child_tid = p->clear_child_tid = NULL;

	p->thread.esp = (unsigned long) childregs;
	p->thread.esp0 = (unsigned long) (childregs+1);

	p->thread.eip = (unsigned long) ret_from_fork;

	savesegment(fs, p->thread.fs);
	savesegment(gs, p->thread.gs);

	tsk = current;
	if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
		struct desc_struct *desc;
		struct user_desc info;
		int idx;

		err = -EFAULT;
		if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
			goto out;
		err = -EINVAL;
		if (LDT_empty(&info))
			goto out;

		idx = info.entry_number;
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			goto out;

		desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}

	err = 0;
 out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
/*
 * fill in the user structure for a core dump..
 */
void dump_thread(struct pt_regs * regs, struct user * dump)
{
	int i;

	/* changed the size calculations - should hopefully work better. lbt */
	dump->magic = CMAGIC;
	dump->start_code = 0;
	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
	dump->u_dsize -= dump->u_tsize;
	dump->u_ssize = 0;
	for (i = 0; i < 8; i++)
		dump->u_debugreg[i] = current->thread.debugreg[i];

	if (dump->start_stack < TASK_SIZE)
		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;

	dump->regs.ebx = regs->ebx;
	dump->regs.ecx = regs->ecx;
	dump->regs.edx = regs->edx;
	dump->regs.esi = regs->esi;
	dump->regs.edi = regs->edi;
	dump->regs.ebp = regs->ebp;
	dump->regs.eax = regs->eax;
	dump->regs.ds = regs->xds;
	dump->regs.es = regs->xes;
	savesegment(fs, dump->regs.fs);
	savesegment(gs, dump->regs.gs);
	dump->regs.orig_eax = regs->orig_eax;
	dump->regs.eip = regs->eip;
	dump->regs.cs = regs->xcs;
	dump->regs.eflags = regs->eflags;
	dump->regs.esp = regs->esp;
	dump->regs.ss = regs->xss;

	dump->u_fpvalid = dump_fpu(regs, &dump->i387);
}
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs ptregs;

	ptregs = *(struct pt_regs *)
		((unsigned long)tsk->thread_info + THREAD_SIZE - sizeof(ptregs));
	ptregs.xcs &= 0xffff;
	ptregs.xds &= 0xffff;
	ptregs.xes &= 0xffff;
	ptregs.xss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}
static inline void
handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
{
	if (!next->io_bitmap_ptr) {
		/*
		 * Disable the bitmap via an invalid offset. We still cache
		 * the previous bitmap owner and the IO bitmap contents:
		 */
		tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
		return;
	}
	if (likely(next == tss->io_bitmap_owner)) {
		/*
		 * Previous owner of the bitmap (hence the bitmap content)
		 * matches the next task, we don't have to do anything but
		 * to set a valid offset in the TSS:
		 */
		tss->io_bitmap_base = IO_BITMAP_OFFSET;
		return;
	}
	/*
	 * Lazy TSS I/O bitmap copy. We set an invalid offset here and
	 * let the task get a GPF in case an I/O instruction is performed.
	 * The GPF handler verifies that the faulting task has a valid
	 * I/O bitmap and, if true, does the real copy and restarts the
	 * instruction. This saves us redundant copies when the currently
	 * switched task does not perform any I/O during its timeslice.
	 */
	tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
}
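/*
 * For context -- a hedged user-space sketch, not part of this file: the
 * per-thread bitmap being switched lazily here is the one established by
 * the ioperm() system call, e.g.
 *
 *	#include <sys/io.h>
 *
 *	if (ioperm(0x378, 3, 1) == 0)	// enable ports 0x378..0x37a
 *		outb(0xff, 0x378);	// no GPF once the bitmap is live
 *
 * The first I/O access after a lazy switch takes one GPF to trigger the
 * real bitmap copy; later accesses in the timeslice run unimpeded.
 */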
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,register) \
		__asm__("movl %0,%%db" #register \
			: /* no output */ \
			:"r" (thread->debugreg[register]))
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPUs, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %eax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

	__unlazy_fpu(prev_p);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_esp0(tss, next);

	/*
	 * Load the per-thread Thread-Local Storage descriptor.
	 */
	load_TLS(next, cpu);

	/*
	 * Save away %fs and %gs. No need to save %es and %ds, as
	 * those are always kernel segments while inside the kernel.
	 */
	asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
	asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));

	/*
	 * Restore %fs and %gs if needed.
	 */
	if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
		loadsegment(fs, next->fs);
		loadsegment(gs, next->gs);
	}

	/*
	 * Now maybe reload the debug registers
	 */
	if (unlikely(next->debugreg[7])) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
		handle_io_bitmap(next, tss);

	return prev_p;
}
asmlinkage int sys_fork(struct pt_regs regs)
{
	return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags;
	unsigned long newsp;
	int __user *parent_tidptr, *child_tidptr;

	clone_flags = regs.ebx;
	newsp = regs.ecx;
	parent_tidptr = (int __user *)regs.edx;
	child_tidptr = (int __user *)regs.edi;
	if (!newsp)
		newsp = regs.esp;
	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage int sys_execve(struct pt_regs regs)
{
	int error;
	char * filename;

	filename = getname((char __user *) regs.ebx);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		goto out;
	error = do_execve(filename,
			(char __user * __user *) regs.ecx,
			(char __user * __user *) regs.edx,
			&regs);
	if (error == 0) {
		current->ptrace &= ~PT_DTRACE;
		/* Make sure we don't return using sysenter.. */
		set_thread_flag(TIF_IRET);
	}
	putname(filename);
out:
	return error;
}
#define top_esp		(THREAD_SIZE - sizeof(unsigned long))
#define top_ebp		(THREAD_SIZE - 2*sizeof(unsigned long))

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long ebp, esp, eip;
	unsigned long stack_page;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack_page = (unsigned long)p->thread_info;
	esp = p->thread.esp;
	if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
		return 0;
	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
	ebp = *(unsigned long *) esp;
	do {
		if (ebp < stack_page || ebp > top_ebp+stack_page)
			return 0;
		eip = *(unsigned long *) (ebp+4);
		if (!in_sched_functions(eip))
			return eip;
		ebp = *(unsigned long *) ebp;
	} while (count++ < 16);
	return 0;
}
/*
 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
 */
static int get_free_idx(void)
{
	struct thread_struct *t = &current->thread;
	int idx;

	for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
		if (desc_empty(t->tls_array + idx))
			return idx + GDT_ENTRY_TLS_MIN;
	return -ESRCH;
}
/*
 * Set a given TLS descriptor:
 */
asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
{
	struct thread_struct *t = &current->thread;
	struct user_desc info;
	struct desc_struct *desc;
	int cpu, idx;

	if (copy_from_user(&info, u_info, sizeof(info)))
		return -EFAULT;
	idx = info.entry_number;

	/*
	 * index -1 means the kernel should try to find and
	 * allocate an empty descriptor:
	 */
	if (idx == -1) {
		idx = get_free_idx();
		if (idx < 0)
			return idx;
		if (put_user(idx, &u_info->entry_number))
			return -EFAULT;
	}

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;

	/*
	 * We must not get preempted while modifying the TLS.
	 */
	cpu = get_cpu();

	if (LDT_empty(&info)) {
		desc->a = 0;
		desc->b = 0;
	} else {
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}
	load_TLS(t, cpu);

	put_cpu();
	return 0;
}
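/*
 * User-space view (a hedged sketch, not part of this file): a threading
 * library typically asks the kernel to pick the slot and then loads the
 * resulting selector into %gs; tls_block is hypothetical.
 *
 *	struct user_desc d = {
 *		.entry_number	= -1,		// kernel chooses a free slot
 *		.base_addr	= (unsigned long) tls_block,
 *		.limit		= 0xfffff,
 *		.seg_32bit	= 1,
 *		.limit_in_pages	= 1,
 *		.useable	= 1,
 *	};
 *	syscall(SYS_set_thread_area, &d);
 *	// selector = index*8 | 3 (GDT, RPL 3)
 *	asm volatile("movw %w0, %%gs" : : "q" (d.entry_number * 8 + 3));
 */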
/*
 * Get the current Thread-Local Storage area:
 */

#define GET_BASE(desc) ( \
	(((desc)->a >> 16) & 0x0000ffff) | \
	(((desc)->b << 16) & 0x00ff0000) | \
	( (desc)->b        & 0xff000000)   )

#define GET_LIMIT(desc) ( \
	((desc)->a & 0x0ffff) | \
	 ((desc)->b & 0xf0000) )

#define GET_32BIT(desc)		(((desc)->b >> 22) & 1)
#define GET_CONTENTS(desc)	(((desc)->b >> 10) & 3)
#define GET_WRITABLE(desc)	(((desc)->b >>  9) & 1)
#define GET_LIMIT_PAGES(desc)	(((desc)->b >> 23) & 1)
#define GET_PRESENT(desc)	(((desc)->b >> 15) & 1)
#define GET_USEABLE(desc)	(((desc)->b >> 20) & 1)
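/*
 * Worked example (illustrative values): a descriptor with a = 0x1234ffff
 * and b = 0xab0056cd carries its base scattered as
 *
 *	base[15:0]  = (a >> 16) & 0xffff	= 0x00001234
 *	base[23:16] = (b << 16) & 0x00ff0000	= 0x00cd0000
 *	base[31:24] =  b        & 0xff000000	= 0xab000000
 *
 * so GET_BASE() reassembles the linear base 0xabcd1234.
 */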
asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
{
	struct user_desc info;
	struct desc_struct *desc;
	int idx;

	if (get_user(idx, &u_info->entry_number))
		return -EFAULT;
	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;

	info.entry_number = idx;
	info.base_addr = GET_BASE(desc);
	info.limit = GET_LIMIT(desc);
	info.seg_32bit = GET_32BIT(desc);
	info.contents = GET_CONTENTS(desc);
	info.read_exec_only = !GET_WRITABLE(desc);
	info.limit_in_pages = GET_LIMIT_PAGES(desc);
	info.seg_not_present = !GET_PRESENT(desc);
	info.useable = GET_USEABLE(desc);

	if (copy_to_user(u_info, &info, sizeof(info)))
		return -EFAULT;
	return 0;
}