/*
 *  linux/arch/i386/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  This file handles the architecture-dependent parts of process handling.
 */
11 #define __KERNEL_SYSCALLS__
14 #include <linux/errno.h>
15 #include <linux/sched.h>
16 #include <linux/kernel.h>
18 #include <linux/smp.h>
19 #include <linux/smp_lock.h>
20 #include <linux/stddef.h>
21 #include <linux/unistd.h>
22 #include <linux/ptrace.h>
23 #include <linux/malloc.h>
24 #include <linux/vmalloc.h>
25 #include <linux/user.h>
26 #include <linux/a.out.h>
27 #include <linux/interrupt.h>
28 #include <linux/config.h>
29 #include <linux/delay.h>
30 #include <linux/reboot.h>
31 #include <linux/init.h>
32 #if defined(CONFIG_APM) && defined(CONFIG_APM_POWER_OFF)
33 #include <linux/apm_bios.h>
36 #include <asm/uaccess.h>
37 #include <asm/pgtable.h>
38 #include <asm/system.h>
41 #include <asm/processor.h>
43 #include <asm/mmu_context.h>
44 #ifdef CONFIG_MATH_EMULATION
45 #include <asm/math_emu.h>
50 spinlock_t semaphore_wake_lock
= SPIN_LOCK_UNLOCKED
;
52 asmlinkage
void ret_from_fork(void) __asm__("ret_from_fork");
55 extern int apm_do_idle(void);
56 extern void apm_do_busy(void);
/*
 * When non-zero the idle loop avoids the HLT instruction; the visible
 * checks below test "!hlt_counter" before halting.  Statics are
 * zero-initialized, so no explicit initializer is needed.
 */
static int hlt_counter;

/* Jiffies of idleness before dropping into the hard idle path. */
#define HARD_IDLE_TIMEOUT (HZ / 3)
63 void disable_hlt(void)
75 static void hard_idle(void)
77 while (!current
->need_resched
) {
78 if (boot_cpu_data
.hlt_works_ok
&& !hlt_counter
) {
80 /* If the APM BIOS is not enabled, or there
81 is an error calling the idle routine, we
82 should hlt if possible. We need to check
83 need_resched again because an interrupt
84 may have occurred in apm_do_idle(). */
86 if (!apm_do_idle() && !current
->need_resched
)
93 if (current
->need_resched
)
103 * The idle loop on a uniprocessor i386..
105 static int cpu_idle(void *unused
)
108 unsigned long start_idle
= 0;
110 /* endless idle loop with no priority at all */
111 current
->priority
= 0;
112 current
->counter
= -100;
117 start_idle
= jiffies
;
119 if (jiffies
- start_idle
> HARD_IDLE_TIMEOUT
)
122 if (boot_cpu_data
.hlt_works_ok
&& !hlt_counter
&& !current
->need_resched
)
126 work
= current
->need_resched
;
135 * This is being executed in task 0 'user space'.
138 int cpu_idle(void *unused
)
140 /* endless idle loop with no priority at all */
141 current
->priority
= 0;
142 current
->counter
= -100;
146 if (current_cpu_data
.hlt_works_ok
&& !hlt_counter
&&
147 !current
->need_resched
)
150 * although we are an idle CPU, we do not want to
151 * get into the scheduler unnecessarily.
153 if (current
->need_resched
) {
162 asmlinkage
int sys_idle(void)
164 if (current
->pid
!= 0)
171 * This routine reboots the machine by asking the keyboard
172 * controller to pulse the reset-line low. We try that for a while,
173 * and if it doesn't work, we do some other stupid things.
/*
 * A zero-limit IDT: loading it and then taking any interrupt forces a
 * triple fault, which resets the CPU.
 */
static long no_idt[2] = {0, 0};

/*
 * 0 means a normal cold boot; 0x1234 (set by the "warm" reboot option)
 * tells the BIOS to bypass the memory test.
 */
static int reboot_mode = 0;

/*
 * Non-zero selects rebooting by jumping through the BIOS reset entry
 * instead of pulsing the keyboard-controller reset line.
 */
static int reboot_thru_bios = 0;
180 __initfunc(void reboot_setup(char *str
, int *ints
))
184 case 'w': /* "warm" reboot (no memory testing etc) */
185 reboot_mode
= 0x1234;
187 case 'c': /* "cold" reboot (with memory testing etc) */
190 case 'b': /* "bios" reboot by jumping through the BIOS */
191 reboot_thru_bios
= 1;
193 case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
194 reboot_thru_bios
= 0;
197 if((str
= strchr(str
,',')) != NULL
)
205 /* The following code and data reboots the machine by switching to real
206 mode and jumping to the BIOS reset entry point, as if the CPU has
207 really been reset. The previous version asked the keyboard
208 controller to pulse the CPU reset line, which is more thorough, but
209 doesn't work with at least one type of 486 motherboard. It is easy
210 to stop this code working; hence the copious comments. */
/*
 * GDT for the real-mode reboot path: a null descriptor plus 16-bit
 * code and data segments laid out the way real mode expects them.
 */
static unsigned long long real_mode_gdt_entries[3] =
{
	0x0000000000000000ULL,	/* Null descriptor */
	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
};
222 unsigned short size
__attribute__ ((packed
));
223 unsigned long long * base
__attribute__ ((packed
));
225 real_mode_gdt
= { sizeof (real_mode_gdt_entries
) - 1, real_mode_gdt_entries
},
226 real_mode_idt
= { 0x3ff, 0 };
228 /* This is 16-bit protected mode code to disable paging and the cache,
229 switch to real mode and jump to the BIOS reset code.
231 The instruction that switches to real mode by writing to CR0 must be
232 followed immediately by a far jump instruction, which set CS to a
233 valid value for real mode, and flushes the prefetch queue to avoid
234 running instructions that have already been decoded in protected
237 Clears all the flags except ET, especially PG (paging), PE
238 (protected-mode enable) and TS (task switch for coprocessor state
239 save). Flushes the TLB after paging has been disabled. Sets CD and
240 NW, to disable the cache on a 486, and invalidates the cache. This
241 is more like the state of a 486 after reset. I don't know if
242 something else should be done for other chips.
244 More could be done here to set up the registers as if a CPU reset had
245 occurred; hopefully real BIOSs don't assume much. */
/*
 * Machine code for the 16-bit stub that machine_restart() copies to low
 * memory.  Per the block comment above: it clears CR0 except ET (paging
 * and protection off), flushes the TLB via CR3, sets CD|NW and invd's
 * the cache on a 486, then far-jumps to the BIOS reset entry.
 */
static unsigned char real_mode_switch[] =
{
	0x66, 0x0f, 0x20, 0xc0,			/* movl %cr0,%eax */
	0x66, 0x83, 0xe0, 0x11,			/* andl $0x00000011,%eax */
	0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,	/* orl $0x60000000,%eax */
	0x66, 0x0f, 0x22, 0xc0,			/* movl %eax,%cr0 */
	0x66, 0x0f, 0x22, 0xd8,			/* movl %eax,%cr3 */
	0x66, 0x0f, 0x20, 0xc3,			/* movl %cr0,%ebx */
	0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
	0x74, 0x02,				/* jz f */
	0x0f, 0x08,				/* invd */
	0x24, 0x10,				/* f: andb $0x10,al */
	0x66, 0x0f, 0x22, 0xc0,			/* movl %eax,%cr0 */
	0xea, 0x00, 0x00, 0xff, 0xff		/* ljmp $0xffff,$0x0000 */
};
/*
 * Poll the keyboard controller status port (0x64) until its input
 * buffer is empty (bit 1 clear), giving up after 0x10000 reads so a
 * wedged controller cannot hang the reboot path.
 */
static inline void kb_wait(void)
{
	int tries = 0x10000;

	while (tries-- > 0) {
		if (!(inb_p(0x64) & 0x02))
			break;
	}
}
272 void machine_restart(char * __unused
)
276 * turn off the IO-APIC, so we can do a clean reboot
281 if(!reboot_thru_bios
) {
282 /* rebooting needs to touch the page at absolute addr 0 */
283 *((unsigned short *)__va(0x472)) = reboot_mode
;
286 for (i
=0; i
<100; i
++) {
289 outb(0xfe,0x64); /* pulse reset low */
292 /* That didn't work - force a triple fault.. */
293 __asm__
__volatile__("lidt %0": :"m" (no_idt
));
294 __asm__
__volatile__("int3");
300 /* Write zero to CMOS register number 0x0f, which the BIOS POST
301 routine will recognize as telling it to do a proper reboot. (Well
302 that's what this book in front of me says -- it may only apply to
303 the Phoenix BIOS though, it's not clear). At the same time,
304 disable NMIs by setting the top bit in the CMOS address register,
305 as we're about to do peculiar things to the CPU. I'm not sure if
306 `outb_p' is needed instead of just `outb'. Use it to be on the
312 /* Remap the kernel at virtual address zero, as well as offset zero
313 from the kernel segment. This assumes the kernel segment starts at
314 virtual address PAGE_OFFSET. */
316 memcpy (swapper_pg_dir
, swapper_pg_dir
+ USER_PGD_PTRS
,
317 sizeof (swapper_pg_dir
[0]) * KERNEL_PGD_PTRS
);
319 /* Make sure the first page is mapped to the start of physical memory.
320 It is normally not mapped, to trap kernel NULL pointer dereferences. */
322 pg0
[0] = _PAGE_RW
| _PAGE_PRESENT
;
325 * Use `swapper_pg_dir' as our page directory. We bother with
326 * `SET_PAGE_DIR' because although might be rebooting, but if we change
327 * the way we set root page dir in the future, then we wont break a
328 * seldom used feature ;)
331 current
->mm
->pgd
= swapper_pg_dir
;
334 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
335 this on booting to tell it to "Bypass memory test (also warm
336 boot)". This seems like a fairly standard thing that gets set by
337 REBOOT.COM programs, and the previous reset routine did this
340 *((unsigned short *)0x472) = reboot_mode
;
342 /* For the switch to real mode, copy some code to low memory. It has
343 to be in the first 64k because it is running in 16-bit mode, and it
344 has to have the same physical and virtual address, because it turns
345 off paging. Copy it near the end of the first page, out of the way
346 of BIOS variables. */
348 memcpy ((void *) (0x1000 - sizeof (real_mode_switch
)),
349 real_mode_switch
, sizeof (real_mode_switch
));
351 /* Set up the IDT for real mode. */
353 __asm__
__volatile__ ("lidt %0" : : "m" (real_mode_idt
));
355 /* Set up a GDT from which we can load segment descriptors for real
356 mode. The GDT is not used in real mode; it is just needed here to
357 prepare the descriptors. */
359 __asm__
__volatile__ ("lgdt %0" : : "m" (real_mode_gdt
));
361 /* Load the data segment registers, and thus the descriptors ready for
362 real mode. The base address of each segment is 0x100, 16 times the
363 selector value being loaded here. This is so that the segment
364 registers don't have to be reloaded after switching to real mode:
365 the values are consistent for real mode operation already. */
367 __asm__
__volatile__ ("movl $0x0010,%%eax\n"
372 "\tmovl %%ax,%%ss" : : : "eax");
374 /* Jump to the 16-bit code that we copied earlier. It disables paging
375 and the cache, switches to real mode, and jumps to the BIOS reset
378 __asm__
__volatile__ ("ljmp $0x0008,%0"
380 : "i" ((void *) (0x1000 - sizeof (real_mode_switch
))));
383 void machine_halt(void)
387 void machine_power_off(void)
389 #if defined(CONFIG_APM) && defined(CONFIG_APM_POWER_OFF)
395 void show_regs(struct pt_regs
* regs
)
397 long cr0
= 0L, cr2
= 0L, cr3
= 0L;
400 printk("EIP: %04x:[<%08lx>]",0xffff & regs
->xcs
,regs
->eip
);
402 printk(" ESP: %04x:%08lx",0xffff & regs
->xss
,regs
->esp
);
403 printk(" EFLAGS: %08lx\n",regs
->eflags
);
404 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
405 regs
->eax
,regs
->ebx
,regs
->ecx
,regs
->edx
);
406 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
407 regs
->esi
, regs
->edi
, regs
->ebp
);
408 printk(" DS: %04x ES: %04x\n",
409 0xffff & regs
->xds
,0xffff & regs
->xes
);
411 __asm__("movl %%cr0, %0": "=r" (cr0
));
412 __asm__("movl %%cr2, %0": "=r" (cr2
));
413 __asm__("movl %%cr3, %0": "=r" (cr3
));
414 printk("CR0: %08lx CR2: %08lx CR3: %08lx\n", cr0
, cr2
, cr3
);
418 * Allocation and freeing of basic task resources.
420 * NOTE! The task struct and the stack go together
422 * The task structure is a two-page thing, and as such
423 * not reliable to allocate using the basic page alloc
424 * functions. We have a small cache of structures for
425 * when the allocations fail..
427 * This extra buffer essentially acts to make for less
428 * "jitter" in the allocations..
430 * On SMP we don't do this right now because:
431 * - we aren't holding any locks when called, and we might
432 * as well just depend on the generic memory management
433 * to do proper locking for us instead of complicating it
435 * - if you use SMP you have a beefy enough machine that
436 * this shouldn't matter..
/*
 * Small cache of free task_struct allocations to smooth out "jitter"
 * when the two-page allocation fails (see the comment above).
 * NOTE(review): the comment above says this is skipped on SMP; the
 * guarding #ifdef is not visible in this chunk — confirm.
 */
#define EXTRA_TASK_STRUCT	16
static struct task_struct *task_struct_stack[EXTRA_TASK_STRUCT];
static int task_struct_stack_ptr = -1;
444 struct task_struct
* alloc_task_struct(void)
446 #ifndef EXTRA_TASK_STRUCT
447 return (struct task_struct
*) __get_free_pages(GFP_KERNEL
,1);
450 struct task_struct
*ret
;
452 index
= task_struct_stack_ptr
;
453 if (index
>= EXTRA_TASK_STRUCT
/2)
455 ret
= (struct task_struct
*) __get_free_pages(GFP_KERNEL
,1);
457 index
= task_struct_stack_ptr
;
460 ret
= task_struct_stack
[index
];
461 task_struct_stack_ptr
= index
-1;
468 void free_task_struct(struct task_struct
*p
)
470 #ifdef EXTRA_TASK_STRUCT
471 int index
= task_struct_stack_ptr
+1;
473 if (index
< EXTRA_TASK_STRUCT
) {
474 task_struct_stack
[index
] = p
;
475 task_struct_stack_ptr
= index
;
478 free_pages((unsigned long) p
, 1);
482 * No need to lock the MM as we are the last user
484 void release_segments(struct mm_struct
*mm
)
486 void * ldt
= mm
->segments
;
494 * special case, when we release the LDT from under
495 * the running CPU. Other CPUs cannot possibly use
496 * this LDT as we were getting here through mmput() ...
498 if (mm
== current
->mm
)
501 * Nobody anymore uses the LDT, we can free it:
507 void forget_segments(void)
509 /* forget local segments */
510 __asm__
__volatile__("movl %w0,%%fs ; movl %w0,%%gs"
515 * Load the LDT entry of init_task.
521 * Create a kernel thread
523 int kernel_thread(int (*fn
)(void *), void * arg
, unsigned long flags
)
527 __asm__
__volatile__(
528 "movl %%esp,%%esi\n\t"
529 "int $0x80\n\t" /* Linux/i386 system call */
530 "cmpl %%esp,%%esi\n\t" /* child or parent? */
531 "je 1f\n\t" /* parent - jump */
532 /* Load the argument into eax, and push it. That way, it does
533 * not matter whether the called function is compiled with
534 * -mregparm or not. */
537 "call *%5\n\t" /* call fn */
538 "movl %3,%0\n\t" /* exit */
541 :"=&a" (retval
), "=&S" (d0
)
542 :"0" (__NR_clone
), "i" (__NR_exit
),
544 "b" (flags
| CLONE_VM
)
550 * Free current thread data structures etc..
/*
 * Architecture hook run when a thread exits.  On i386 there is no
 * per-thread state to tear down here.
 */
void exit_thread(void)
{
	/* nothing to do ... */
}
557 void flush_thread(void)
559 struct task_struct
*tsk
= current
;
561 memset(tsk
->thread
.debugreg
, 0, sizeof(unsigned long)*8);
563 * Forget coprocessor state..
569 void release_thread(struct task_struct
*dead_task
)
572 void * ldt
= dead_task
->mm
->segments
;
574 // temporary debugging check
576 printk("WARNING: dead process %8s still has LDT? <%p>\n",
577 dead_task
->comm
, ldt
);
584 * If new_mm is NULL, we're being called to set up the LDT for
585 * a clone task: this is easy since the clone is not running yet.
586 * otherwise we copy the old segment into a new segment.
588 * we do not have to muck with descriptors here, that is
589 * done in __switch_to() and get_mmu_context().
591 void copy_segments(struct task_struct
*p
, struct mm_struct
*new_mm
)
593 struct mm_struct
* old_mm
= current
->mm
;
594 void * old_ldt
= old_mm
->segments
, * ldt
= old_ldt
;
596 if (!old_mm
->segments
) {
598 * default LDT - use the one from init_task
601 new_mm
->segments
= NULL
;
607 * Completely new LDT, we initialize it from the parent:
609 ldt
= vmalloc(LDT_ENTRIES
*LDT_ENTRY_SIZE
);
611 printk(KERN_WARNING
"ldt allocation failed\n");
613 memcpy(ldt
, old_ldt
, LDT_ENTRIES
*LDT_ENTRY_SIZE
);
614 new_mm
->segments
= ldt
;
622 #define savesegment(seg,value) \
623 asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value)))
625 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long esp
,
626 struct task_struct
* p
, struct pt_regs
* regs
)
628 struct pt_regs
* childregs
;
630 childregs
= ((struct pt_regs
*) (THREAD_SIZE
+ (unsigned long) p
)) - 1;
633 childregs
->esp
= esp
;
635 p
->thread
.esp
= (unsigned long) childregs
;
636 p
->thread
.esp0
= (unsigned long) (childregs
+1);
638 p
->thread
.eip
= (unsigned long) ret_from_fork
;
640 savesegment(fs
,p
->thread
.fs
);
641 savesegment(gs
,p
->thread
.gs
);
644 p
->thread
.i387
= current
->thread
.i387
;
650 * fill in the FPU structure for a core dump.
652 int dump_fpu (struct pt_regs
* regs
, struct user_i387_struct
* fpu
)
655 struct task_struct
*tsk
= current
;
657 fpvalid
= tsk
->used_math
;
660 memcpy(fpu
,&tsk
->thread
.i387
.hard
,sizeof(*fpu
));
667 * fill in the user structure for a core dump..
669 void dump_thread(struct pt_regs
* regs
, struct user
* dump
)
673 /* changed the size calculations - should hopefully work better. lbt */
674 dump
->magic
= CMAGIC
;
675 dump
->start_code
= 0;
676 dump
->start_stack
= regs
->esp
& ~(PAGE_SIZE
- 1);
677 dump
->u_tsize
= ((unsigned long) current
->mm
->end_code
) >> PAGE_SHIFT
;
678 dump
->u_dsize
= ((unsigned long) (current
->mm
->brk
+ (PAGE_SIZE
-1))) >> PAGE_SHIFT
;
679 dump
->u_dsize
-= dump
->u_tsize
;
681 for (i
= 0; i
< 8; i
++)
682 dump
->u_debugreg
[i
] = current
->thread
.debugreg
[i
];
684 if (dump
->start_stack
< TASK_SIZE
)
685 dump
->u_ssize
= ((unsigned long) (TASK_SIZE
- dump
->start_stack
)) >> PAGE_SHIFT
;
687 dump
->regs
.ebx
= regs
->ebx
;
688 dump
->regs
.ecx
= regs
->ecx
;
689 dump
->regs
.edx
= regs
->edx
;
690 dump
->regs
.esi
= regs
->esi
;
691 dump
->regs
.edi
= regs
->edi
;
692 dump
->regs
.ebp
= regs
->ebp
;
693 dump
->regs
.eax
= regs
->eax
;
694 dump
->regs
.ds
= regs
->xds
;
695 dump
->regs
.es
= regs
->xes
;
696 savesegment(fs
,dump
->regs
.fs
);
697 savesegment(gs
,dump
->regs
.gs
);
698 dump
->regs
.orig_eax
= regs
->orig_eax
;
699 dump
->regs
.eip
= regs
->eip
;
700 dump
->regs
.cs
= regs
->xcs
;
701 dump
->regs
.eflags
= regs
->eflags
;
702 dump
->regs
.esp
= regs
->esp
;
703 dump
->regs
.ss
= regs
->xss
;
705 dump
->u_fpvalid
= dump_fpu (regs
, &dump
->i387
);
709 * This special macro can be used to load a debugging register
/*
 * Load hardware debug register db<register> from thread->debugreg[register].
 * <register> is pasted into the instruction, so it must be a literal 0-7.
 * Fix: parenthesize the <thread> argument so an expression argument
 * (e.g. a cast or conditional) cannot mis-bind against "->".
 */
#define loaddebug(thread, register) \
	__asm__("movl %0,%%db" #register \
		: /* no output */ \
		: "r" ((thread)->debugreg[register]))
717 * switch_to(x,yn) should switch tasks from x to y.
719 * We fsave/fwait so that an exception goes off at the right time
720 * (as a call from the fsave or fwait in effect) rather than to
721 * the wrong process. Lazy FP saving no longer makes any sense
722 * with modern CPU's, and this simplifies a lot of things (SMP
723 * and UP become the same).
725 * NOTE! We used to use the x86 hardware context switching. The
726 * reason for not using it any more becomes apparent when you
727 * try to recover gracefully from saved state that is no longer
728 * valid (stale segment register values in particular). With the
729 * hardware task-switch, there is no way to fix up bad state in
730 * a reasonable manner.
732 * The fact that Intel documents the hardware task-switching to
733 * be slow is a fairly red herring - this code is not noticeably
734 * faster. However, there _is_ some room for improvement here,
735 * so the performance issues may eventually be a valid point.
736 * More important, however, is the fact that this allows us much
739 extern int cpus_initialized
;
740 void __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
742 struct thread_struct
*prev
= &prev_p
->thread
,
743 *next
= &next_p
->thread
;
744 struct tss_struct
*tss
= init_tss
+ smp_processor_id();
749 * Reload esp0, LDT and the page table pointer:
751 tss
->esp0
= next
->esp0
;
754 * Save away %fs and %gs. No need to save %es and %ds, as
755 * those are always kernel segments while inside the kernel.
757 asm volatile("movl %%fs,%0":"=m" (*(int *)&prev
->fs
));
758 asm volatile("movl %%gs,%0":"=m" (*(int *)&prev
->gs
));
761 * Re-load LDT if necessary
763 if (prev_p
->active_mm
->segments
!= next_p
->active_mm
->segments
)
764 load_LDT(next_p
->mm
);
766 /* Re-load page tables */
768 unsigned long new_cr3
= next
->cr3
;
771 if (new_cr3
!= prev
->cr3
)
772 asm volatile("movl %0,%%cr3": :"r" (new_cr3
));
776 * Restore %fs and %gs.
778 loadsegment(fs
, next
->fs
);
779 loadsegment(gs
, next
->gs
);
782 * Now maybe reload the debug registers
784 if (next
->debugreg
[7]){
794 if (prev
->ioperm
|| next
->ioperm
) {
797 * 4 cachelines copy ... not good, but not that
798 * bad either. Anyone got something better?
799 * This only affects processes which use ioperm().
800 * [Putting the TSSs into 4k-tlb mapped regions
801 * and playing VM tricks to switch the IO bitmap
802 * is not really acceptable.]
804 memcpy(tss
->io_bitmap
, next
->io_bitmap
,
805 IO_BITMAP_SIZE
*sizeof(unsigned long));
806 tss
->bitmap
= IO_BITMAP_OFFSET
;
809 * a bitmap offset pointing outside of the TSS limit
810 * causes a nicely controllable SIGSEGV if a process
811 * tries to use a port IO instruction. The first
812 * sys_ioperm() call sets up the bitmap properly.
814 tss
->bitmap
= INVALID_IO_BITMAP_OFFSET
;
818 asmlinkage
int sys_fork(struct pt_regs regs
)
820 return do_fork(SIGCHLD
, regs
.esp
, ®s
);
823 asmlinkage
int sys_clone(struct pt_regs regs
)
825 unsigned long clone_flags
;
828 clone_flags
= regs
.ebx
;
832 return do_fork(clone_flags
, newsp
, ®s
);
836 * This is trivial, and on the face of it looks like it
837 * could equally well be done in user mode.
839 * Not so, for quite unobvious reasons - register pressure.
840 * In user mode vfork() cannot have a stack frame, and if
841 * done by calling the "clone()" system call directly, you
842 * do not have enough call-clobbered registers to hold all
843 * the information you need.
845 asmlinkage
int sys_vfork(struct pt_regs regs
)
847 return do_fork(CLONE_VFORK
| CLONE_VM
| SIGCHLD
, regs
.esp
, ®s
);
851 * sys_execve() executes a new program.
853 asmlinkage
int sys_execve(struct pt_regs regs
)
859 filename
= getname((char *) regs
.ebx
);
860 error
= PTR_ERR(filename
);
861 if (IS_ERR(filename
))
863 error
= do_execve(filename
, (char **) regs
.ecx
, (char **) regs
.edx
, ®s
);
865 current
->flags
&= ~PF_DTRACE
;