/*
 * linux/arch/i386/kernel/process.c
 *
 * Copyright (C) 1995  Linus Torvalds
 *
 * This file handles the architecture-dependent parts of process handling.
 */

#define __KERNEL_SYSCALLS__

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/malloc.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/config.h>
#include <linux/unistd.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/reboot.h>
#include <linux/init.h>
#if defined(CONFIG_APM) && defined(CONFIG_APM_POWER_OFF)
#include <linux/apm_bios.h>
#endif

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif

spinlock_t semaphore_wake_lock = SPIN_LOCK_UNLOCKED;

#ifdef __SMP__
asmlinkage void ret_from_fork(void) __asm__("ret_from_smpfork");
#else
asmlinkage void ret_from_fork(void) __asm__("ret_from_sys_call");
#endif

extern int apm_do_idle(void);
extern void apm_do_busy(void);

static int hlt_counter = 0;

#define HARD_IDLE_TIMEOUT (HZ / 3)
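
/*
 * A non-zero hlt_counter keeps the idle loops below from executing the
 * HLT instruction; disable_hlt() bumps the counter.
 */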
void disable_hlt(void)
{
	hlt_counter++;
}

#ifndef __SMP__

static void hard_idle(void)
{
	while (!current->need_resched) {
		if (boot_cpu_data.hlt_works_ok && !hlt_counter) {
			/* If the APM BIOS is not enabled, or there
			   is an error calling the idle routine, we
			   should hlt if possible.  We need to check
			   need_resched again because an interrupt
			   may have occurred in apm_do_idle(). */
			if (!apm_do_idle() && !current->need_resched)
				__asm__("hlt");
		}
		if (current->need_resched)
			break;
		schedule();
	}
}

/*
 * The idle loop on a uniprocessor i386..
 */
asmlinkage int sys_idle(void)
{
	unsigned long start_idle = 0;
	int ret = -EPERM;

	lock_kernel();
	if (current->pid != 0)
		goto out;

	/* endless idle loop with no priority at all */
	current->priority = 0;
	current->counter = 0;
	for (;;) {
		/*
		 * We are locked at this point. So we can safely call
		 * the APM bios knowing only one CPU at a time will do
		 * so.
		 */
		if (!start_idle)
			start_idle = jiffies;
		if (jiffies - start_idle > HARD_IDLE_TIMEOUT)
			hard_idle();
		else if (boot_cpu_data.hlt_works_ok && !hlt_counter && !current->need_resched)
			__asm__("hlt");
		run_task_queue(&tq_scheduler);
		if (current->need_resched)
			start_idle = 0;
		schedule();
	}
	ret = 0;
out:
	unlock_kernel();
	return ret;
}

#else

/*
 * This is being executed in task 0 'user space'.
 */
int cpu_idle(void *unused)
{
	current->priority = 0;
	while (1) {
		if (current_cpu_data.hlt_works_ok &&
		    !hlt_counter && !current->need_resched)
			__asm__("hlt");
		run_task_queue(&tq_scheduler);

		/* endless idle loop with no priority at all */
		current->counter = 0;
		schedule();
	}
}

asmlinkage int sys_idle(void)
{
	if (current->pid != 0)
		return -EPERM;

	cpu_idle(NULL);
	return 0;
}

#endif

/*
 * This routine reboots the machine by asking the keyboard
 * controller to pulse the reset-line low. We try that for a while,
 * and if it doesn't work, we do some other stupid things.
 */

static long no_idt[2] = {0, 0};
static int reboot_mode = 0;
static int reboot_thru_bios = 0;

__initfunc(void reboot_setup(char *str, int *ints))
{
	while (1) {
		switch (*str) {
		case 'w': /* "warm" reboot (no memory testing etc) */
			reboot_mode = 0x1234;
			break;
		case 'c': /* "cold" reboot (with memory testing etc) */
			reboot_mode = 0x0;
			break;
		case 'b': /* "bios" reboot by jumping through the BIOS */
			reboot_thru_bios = 1;
			break;
		case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
			reboot_thru_bios = 0;
			break;
		}
		if ((str = strchr(str,',')) != NULL)
			str++;
		else
			break;
	}
}

/* The following code and data reboots the machine by switching to real
   mode and jumping to the BIOS reset entry point, as if the CPU has
   really been reset.  The previous version asked the keyboard
   controller to pulse the CPU reset line, which is more thorough, but
   doesn't work with at least one type of 486 motherboard.  It is easy
   to stop this code working; hence the copious comments. */

static unsigned long long
real_mode_gdt_entries [3] =
{
	0x0000000000000000ULL,	/* Null descriptor */
	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
};

static struct
{
	unsigned short       size __attribute__ ((packed));
	unsigned long long * base __attribute__ ((packed));
}
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 };
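
/*
 * real_mode_gdt and real_mode_idt are the 6-byte (limit, base) operands
 * handed to the lgdt/lidt instructions in machine_restart() below; the
 * packed attribute keeps the 16-bit limit and the base contiguous.
 */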

/* This is 16-bit protected mode code to disable paging and the cache,
   switch to real mode and jump to the BIOS reset code.

   The instruction that switches to real mode by writing to CR0 must be
   followed immediately by a far jump instruction, which sets CS to a
   valid value for real mode, and flushes the prefetch queue to avoid
   running instructions that have already been decoded in protected
   mode.

   Clears all the flags except ET, especially PG (paging), PE
   (protected-mode enable) and TS (task switch for coprocessor state
   save).  Flushes the TLB after paging has been disabled.  Sets CD and
   NW, to disable the cache on a 486, and invalidates the cache.  This
   is more like the state of a 486 after reset.  I don't know if
   something else should be done for other chips.

   More could be done here to set up the registers as if a CPU reset had
   occurred; hopefully real BIOSs don't assume much. */

static unsigned char real_mode_switch [] =
{
	0x66, 0x0f, 0x20, 0xc0,				/* movl  %cr0,%eax */
	0x66, 0x83, 0xe0, 0x11,				/* andl  $0x00000011,%eax */
	0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,		/* orl   $0x60000000,%eax */
	0x66, 0x0f, 0x22, 0xc0,				/* movl  %eax,%cr0 */
	0x66, 0x0f, 0x22, 0xd8,				/* movl  %eax,%cr3 */
	0x66, 0x0f, 0x20, 0xc3,				/* movl  %cr0,%ebx */
	0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,	/* andl  $0x60000000,%ebx */
	0x74, 0x02,					/* jz    f */
	0x0f, 0x08,					/* invd */
	0x24, 0x10,					/* f: andb $0x10,al */
	0x66, 0x0f, 0x22, 0xc0,				/* movl  %eax,%cr0 */
	0xea, 0x00, 0x00, 0xff, 0xff			/* ljmp  $0xffff,$0x0000 */
};
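
/*
 * Poll the keyboard controller status port until its input buffer is
 * empty (or we give up), so it is safe to send it another command.
 */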
static inline void kb_wait(void)
{
	int i;

	for (i=0; i<0x10000; i++)
		if ((inb_p(0x64) & 0x02) == 0)
			break;
}

void machine_restart(char * __unused)
{
	/*
	 * turn off the IO-APIC, so we can do a clean reboot
	 */

	if (!reboot_thru_bios) {
		/* rebooting needs to touch the page at absolute addr 0 */
		*((unsigned short *)__va(0x472)) = reboot_mode;
		for (;;) {
			int i;
			for (i=0; i<100; i++) {
				kb_wait();
				udelay(50);
				outb(0xfe,0x64);	/* pulse reset low */
				udelay(50);
			}
			/* That didn't work - force a triple fault.. */
			__asm__ __volatile__("lidt %0": :"m" (no_idt));
			__asm__ __volatile__("int3");
		}
	}

	/* Write zero to CMOS register number 0x0f, which the BIOS POST
	   routine will recognize as telling it to do a proper reboot.  (Well
	   that's what this book in front of me says -- it may only apply to
	   the Phoenix BIOS though, it's not clear).  At the same time,
	   disable NMIs by setting the top bit in the CMOS address register,
	   as we're about to do peculiar things to the CPU.  I'm not sure if
	   `outb_p' is needed instead of just `outb'.  Use it to be on the
	   safe side. */

	outb_p (0x8f, 0x70);
	outb_p (0x00, 0x71);

	/* Remap the kernel at virtual address zero, as well as offset zero
	   from the kernel segment.  This assumes the kernel segment starts at
	   virtual address PAGE_OFFSET. */

	memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
		sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);

	/* Make sure the first page is mapped to the start of physical memory.
	   It is normally not mapped, to trap kernel NULL pointer dereferences. */

	/*
	 * Use `swapper_pg_dir' as our page directory.  We bother with
	 * `SET_PAGE_DIR' because although we might be rebooting, if we change
	 * the way we set the root page dir in the future, then we won't break
	 * a seldom used feature ;)
	 */

	SET_PAGE_DIR(current,swapper_pg_dir);

	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
	   this on booting to tell it to "Bypass memory test (also warm
	   boot)".  This seems like a fairly standard thing that gets set by
	   REBOOT.COM programs, and the previous reset routine did this
	   too. */

	*((unsigned short *)0x472) = reboot_mode;

	/* For the switch to real mode, copy some code to low memory.  It has
	   to be in the first 64k because it is running in 16-bit mode, and it
	   has to have the same physical and virtual address, because it turns
	   off paging.  Copy it near the end of the first page, out of the way
	   of BIOS variables. */

	memcpy ((void *) (0x1000 - sizeof (real_mode_switch)),
		real_mode_switch, sizeof (real_mode_switch));

	/* Set up the IDT for real mode. */

	__asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));

	/* Set up a GDT from which we can load segment descriptors for real
	   mode.  The GDT is not used in real mode; it is just needed here to
	   prepare the descriptors. */

	__asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));

	/* Load the data segment registers, and thus the descriptors ready for
	   real mode.  The base address of each segment is 0x100, 16 times the
	   selector value being loaded here.  This is so that the segment
	   registers don't have to be reloaded after switching to real mode:
	   the values are consistent for real mode operation already. */

	__asm__ __volatile__ ("movl $0x0010,%%eax\n"
				"\tmovl %%ax,%%ds\n"
				"\tmovl %%ax,%%es\n"
				"\tmovl %%ax,%%fs\n"
				"\tmovl %%ax,%%gs\n"
				"\tmovl %%ax,%%ss" : : : "eax");

	/* Jump to the 16-bit code that we copied earlier.  It disables paging
	   and the cache, switches to real mode, and jumps to the BIOS reset
	   entry point. */

	__asm__ __volatile__ ("ljmp $0x0008,%0"
				:
				: "i" ((void *) (0x1000 - sizeof (real_mode_switch))));
}
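
/*
 * machine_halt() has no architecture-specific work to do here;
 * machine_power_off() hands off to the APM BIOS when APM power-off
 * support is configured in.
 */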
void machine_halt(void)
{
}

void machine_power_off(void)
{
#if defined(CONFIG_APM) && defined(CONFIG_APM_POWER_OFF)
	apm_set_power_state(APM_STATE_OFF);
#endif
}
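
/*
 * Print the saved register state in regs, followed by the current
 * contents of the control registers.
 */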
void show_regs(struct pt_regs * regs)
{
	long cr0 = 0L, cr2 = 0L, cr3 = 0L;

	printk("EIP: %04x:[<%08lx>]",0xffff & regs->xcs,regs->eip);
	printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
	printk(" EFLAGS: %08lx\n",regs->eflags);
	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
		regs->eax,regs->ebx,regs->ecx,regs->edx);
	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
		regs->esi, regs->edi, regs->ebp);
	printk(" DS: %04x ES: %04x\n",
		0xffff & regs->xds,0xffff & regs->xes);
	__asm__("movl %%cr0, %0": "=r" (cr0));
	__asm__("movl %%cr2, %0": "=r" (cr2));
	__asm__("movl %%cr3, %0": "=r" (cr3));
	printk("CR0: %08lx CR2: %08lx CR3: %08lx\n", cr0, cr2, cr3);
}

/*
 * Allocation and freeing of basic task resources.
 *
 * NOTE! The task struct and the stack go together
 *
 * The task structure is a two-page thing, and as such
 * not reliable to allocate using the basic page alloc
 * functions. We have a small cache of structures for
 * when the allocations fail..
 *
 * This extra buffer essentially acts to make for less
 * "jitter" in the allocations..
 *
 * On SMP we don't do this right now because:
 *  - we aren't holding any locks when called, and we might
 *    as well just depend on the generic memory management
 *    to do proper locking for us instead of complicating it
 *  - if you use SMP you have a beefy enough machine that
 *    this shouldn't matter..
 */
#ifndef __SMP__
#define EXTRA_TASK_STRUCT	16
static struct task_struct * task_struct_stack[EXTRA_TASK_STRUCT];
static int task_struct_stack_ptr = -1;
#endif
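
/* task_struct_stack_ptr indexes the topmost entry in the cache above;
   -1 means the cache is empty. */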

struct task_struct * alloc_task_struct(void)
{
#ifndef EXTRA_TASK_STRUCT
	return (struct task_struct *) __get_free_pages(GFP_KERNEL,1);
#else
	int index;
	struct task_struct *ret;

	index = task_struct_stack_ptr;
	if (index >= EXTRA_TASK_STRUCT/2)
		goto use_cache;

	ret = (struct task_struct *) __get_free_pages(GFP_KERNEL,1);
	if (!ret) {
		index = task_struct_stack_ptr;
		if (index >= 0) {
use_cache:
			ret = task_struct_stack[index];
			task_struct_stack_ptr = index-1;
		}
	}
	return ret;
#endif
}

void free_task_struct(struct task_struct *p)
{
#ifdef EXTRA_TASK_STRUCT
	int index = task_struct_stack_ptr+1;

	if (index < EXTRA_TASK_STRUCT) {
		task_struct_stack[index] = p;
		task_struct_stack_ptr = index;
	} else
#endif
		free_pages((unsigned long) p, 1);
}
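
/*
 * Drop the private LDT attached to this mm (if any) and point the
 * current task back at the default LDT.
 */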
void release_segments(struct mm_struct *mm)
{
	void * ldt = mm->segments;
	int nr;

	/* forget local segments */
	__asm__ __volatile__("movl %w0,%%fs ; movl %w0,%%gs ; lldt %w0"
		: /* no outputs */
		: "r" (0));
	current->tss.ldt = 0;

	/*
	 * Set the GDT entry back to the default.
	 */
	nr = current->tarray_ptr - &task[0];
	set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY, &default_ldt, 1);

	if (ldt) {
		mm->segments = NULL;
		vfree(ldt);
	}
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	/* nothing to do ... */
}

void flush_thread(void)
{
	int i;

	for (i=0 ; i<8 ; i++)
		current->tss.debugreg[i] = 0;

	/*
	 * Forget coprocessor state..
	 */
	if (current->flags & PF_USEDFPU) {
		current->flags &= ~PF_USEDFPU;
		stts();
	}
	current->used_math = 0;
}

void release_thread(struct task_struct *dead_task)
{
}
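
/*
 * If the given task has used the FPU since it was last scheduled, save
 * its FPU state into the TSS and wait for the store to complete.
 */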
static inline void unlazy_fpu(struct task_struct *tsk)
{
	if (tsk->flags & PF_USEDFPU) {
		tsk->flags &= ~PF_USEDFPU;
		__asm__("fnsave %0":"=m" (tsk->tss.i387));
		asm volatile("fwait");
	}
}

/*
 * If new_mm is NULL, we're being called to set up the LDT descriptor
 * for a clone task. Each clone must have a separate entry in the GDT.
 */
void copy_segments(int nr, struct task_struct *p, struct mm_struct *new_mm)
{
	struct mm_struct * old_mm = current->mm;
	void * old_ldt = old_mm->segments, * ldt = old_ldt;
	int ldt_size = LDT_ENTRIES;

	p->tss.ldt = _LDT(nr);
	if (old_ldt && new_mm) {
		ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
		new_mm->segments = ldt;
		if (!ldt)
			printk(KERN_WARNING "ldt allocation failed\n");
		else
			memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
	}
	if (!ldt) {
		ldt = &default_ldt;
		ldt_size = 1;
	}
	set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY, ldt, ldt_size);
}
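
/*
 * savesegment(seg, value) stores the current contents of segment
 * register %seg into the given memory location.
 */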
#define savesegment(seg,value) \
	asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value)))
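
/*
 * copy_thread() sets up the child's kernel stack frame and TSS so that
 * the first switch to the new task lands in ret_from_fork.
 */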
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;

	childregs = ((struct pt_regs *) (2*PAGE_SIZE + (unsigned long) p)) - 1;
	*childregs = *regs;
	childregs->eax = 0;
	childregs->esp = esp;
	childregs->eflags = regs->eflags & 0xffffcfff;	/* iopl always 0 for a new process */

	p->tss.esp = (unsigned long) childregs;
	p->tss.esp0 = (unsigned long) (childregs+1);
	p->tss.ss0 = __KERNEL_DS;

	p->tss.tr = _TSS(nr);
	set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
	p->tss.eip = (unsigned long) ret_from_fork;

	savesegment(fs,p->tss.fs);
	savesegment(gs,p->tss.gs);

	/*
	 * a bitmap offset pointing outside of the TSS limit causes a nicely
	 * controllable SIGSEGV. The first sys_ioperm() call sets up the
	 * bitmap properly.
	 */
	p->tss.bitmap = sizeof(struct thread_struct);

	unlazy_fpu(current);
	p->tss.i387 = current->tss.i387;

	return 0;
}

/*
 * fill in the FPU structure for a core dump.
 */
int dump_fpu (struct pt_regs * regs, struct user_i387_struct * fpu)
{
	int fpvalid;

	fpvalid = current->used_math;
	if (fpvalid) {
		unlazy_fpu(current);
		memcpy(fpu, &current->tss.i387.hard, sizeof(*fpu));
	}

	return fpvalid;
}

/*
 * fill in the user structure for a core dump..
 */
void dump_thread(struct pt_regs * regs, struct user * dump)
{
	int i;

/* changed the size calculations - should hopefully work better. lbt */
	dump->magic = CMAGIC;
	dump->start_code = 0;
	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
	dump->u_dsize -= dump->u_tsize;

	for (i = 0; i < 8; i++)
		dump->u_debugreg[i] = current->tss.debugreg[i];

	if (dump->start_stack < TASK_SIZE)
		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;

	dump->regs.ebx = regs->ebx;
	dump->regs.ecx = regs->ecx;
	dump->regs.edx = regs->edx;
	dump->regs.esi = regs->esi;
	dump->regs.edi = regs->edi;
	dump->regs.ebp = regs->ebp;
	dump->regs.eax = regs->eax;
	dump->regs.ds = regs->xds;
	dump->regs.es = regs->xes;
	savesegment(fs,dump->regs.fs);
	savesegment(gs,dump->regs.gs);
	dump->regs.orig_eax = regs->orig_eax;
	dump->regs.eip = regs->eip;
	dump->regs.cs = regs->xcs;
	dump->regs.eflags = regs->eflags;
	dump->regs.esp = regs->esp;
	dump->regs.ss = regs->xss;

	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(tsk,register) \
		__asm__("movl %0,%%db" #register \
			: /* no output */ \
			:"r" (tsk->tss.debugreg[register]))

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 */
void __switch_to(struct task_struct *prev, struct task_struct *next)
{
	/* Do the FPU save and set TS if it wasn't set before.. */
	unlazy_fpu(prev);

	/*
	 * Reload TR, LDT and the page table pointers..
	 *
	 * We need TR for the IO permission bitmask (and
	 * the vm86 bitmasks in case we ever use enhanced
	 * v86 mode properly).
	 *
	 * We may want to get rid of the TR register some
	 * day, and copy the bitmaps around by hand. Oh,
	 * well. In the meantime we have to clear the busy
	 * bit in the TSS entry, ugh.
	 */
	gdt_table[next->tss.tr >> 3].b &= 0xfffffdff;
	asm volatile("ltr %0": :"g" (*(unsigned short *)&next->tss.tr));

	/* Re-load LDT if necessary */
	if (next->mm->segments != prev->mm->segments)
		asm volatile("lldt %0": :"g" (*(unsigned short *)&next->tss.ldt));

	/* Re-load page tables */
	if (next->tss.cr3 != prev->tss.cr3)
		asm volatile("movl %0,%%cr3": :"r" (next->tss.cr3));

	/*
	 * Save away %fs and %gs. No need to save %es and %ds, as
	 * those are always kernel segments while inside the kernel.
	 * Restore the new values.
	 */
	asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->tss.fs));
	asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->tss.gs));

	loadsegment(fs,next->tss.fs);
	loadsegment(gs,next->tss.gs);

	/*
	 * Now maybe reload the debug registers
	 */
	if (next->tss.debugreg[7]){
		loaddebug(next,0);
		loaddebug(next,1);
		loaddebug(next,2);
		loaddebug(next,3);
		/* no 4 and 5 */
		loaddebug(next,6);
		loaddebug(next,7);
	}
}
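
/*
 * The system call entry code pushes the full register frame, so these
 * routines declare a struct pt_regs argument: its fields carry the
 * caller's arguments and its address is the frame that do_fork() and
 * do_execve() may modify.
 */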
asmlinkage int sys_fork(struct pt_regs regs)
{
	int ret;

	lock_kernel();
	ret = do_fork(SIGCHLD, regs.esp, &regs);
	unlock_kernel();
	return ret;
}

asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags;
	unsigned long newsp;
	int ret;

	lock_kernel();
	clone_flags = regs.ebx;
	newsp = regs.ecx;
	if (!newsp)
		newsp = regs.esp;
	ret = do_fork(clone_flags, newsp, &regs);
	unlock_kernel();
	return ret;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage int sys_execve(struct pt_regs regs)
{
	int error;
	char * filename;

	lock_kernel();
	filename = getname((char *) regs.ebx);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		goto out;
	error = do_execve(filename, (char **) regs.ecx, (char **) regs.edx, &regs);
	putname(filename);
out:
	unlock_kernel();
	return error;
}