/*
 *  linux/arch/i386/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
#define __KERNEL_SYSCALLS__
#include <stdarg.h>

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/malloc.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/config.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif

#include <linux/irq.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

int hlt_counter;

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);

/*
 * Power off function, if any
 */
void (*pm_power_off)(void);

void disable_hlt(void)
{
	hlt_counter++;
}

void enable_hlt(void)
{
	hlt_counter--;
}
/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	if (current_cpu_data.hlt_works_ok && !hlt_counter) {
		__cli();
		if (!current->need_resched)
			safe_halt();
		else
			__sti();
	}
}
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	int oldval;

	__sti();

	/*
	 * Deal with another CPU just having chosen a thread to
	 * run here:
	 */
	oldval = xchg(&current->need_resched, -1);

	if (!oldval)
		asm volatile(
			"2:"
			"cmpl $-1, %0;"
			"rep; nop;"
			"je 2b;"
				: :"m" (current->need_resched));
}
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	/* endless idle loop with no priority at all */
	init_idle();
	current->nice = 20;
	current->counter = -100;

	while (1) {
		void (*idle)(void) = pm_idle;
		if (!idle)
			idle = default_idle;
		while (!current->need_resched)
			idle();
		schedule();
		check_pgt_cache();
	}
}
static int __init idle_setup (char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}

	return 1;
}

__setup("idle=", idle_setup);
static long no_idt[2];
static int reboot_mode;
static int reboot_thru_bios;

static int __init reboot_setup(char *str)
{
	while(1) {
		switch (*str) {
		case 'w': /* "warm" reboot (no memory testing etc) */
			reboot_mode = 0x1234;
			break;
		case 'c': /* "cold" reboot (with memory testing etc) */
			reboot_mode = 0x0;
			break;
		case 'b': /* "bios" reboot by jumping through the BIOS */
			reboot_thru_bios = 1;
			break;
		case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
			reboot_thru_bios = 0;
			break;
		}
		if((str = strchr(str,',')) != NULL)
			str++;
		else
			break;
	}
	return 1;
}

__setup("reboot=", reboot_setup);
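/*
 * Only the first character of each comma-separated token matters above, so
 * "reboot=warm,b" and "reboot=w,bios" behave the same. A standalone sketch
 * of that walk (parse_reboot and its arguments are made-up names for
 * illustration, not kernel interfaces):
 */
#include <stdio.h>
#include <string.h>

static void parse_reboot(const char *str, int *mode, int *thru_bios)
{
	while (1) {
		switch (*str) {
		case 'w': *mode = 0x1234; break;	/* warm */
		case 'c': *mode = 0x0;    break;	/* cold */
		case 'b': *thru_bios = 1; break;	/* via BIOS */
		case 'h': *thru_bios = 0; break;	/* hard */
		}
		if ((str = strchr(str, ',')) != NULL)
			str++;
		else
			break;
	}
}

int main(void)
{
	int mode = 0, thru_bios = 0;

	/* "reboot=warm,b" on the kernel command line reaches the handler as "warm,b" */
	parse_reboot("warm,b", &mode, &thru_bios);
	printf("mode=%#x thru_bios=%d\n", mode, thru_bios);	/* mode=0x1234 thru_bios=1 */
	return 0;
}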
/* The following code and data reboots the machine by switching to real
   mode and jumping to the BIOS reset entry point, as if the CPU has
   really been reset.  The previous version asked the keyboard
   controller to pulse the CPU reset line, which is more thorough, but
   doesn't work with at least one type of 486 motherboard.  It is easy
   to stop this code working; hence the copious comments. */

static unsigned long long
real_mode_gdt_entries [3] =
{
	0x0000000000000000ULL,	/* Null descriptor */
	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
};

static struct
{
	unsigned short       size __attribute__ ((packed));
	unsigned long long * base __attribute__ ((packed));
}
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 };
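/*
 * The two non-null descriptors above are just base/limit/access fields
 * packed into the x86 GDT entry layout. A small standalone check of those
 * constants (the helper name pack_descriptor is made up for illustration):
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t pack_descriptor(uint32_t base, uint32_t limit,
				uint8_t access, uint8_t flags)
{
	return  ((uint64_t)(limit & 0xffffu))              |
		((uint64_t)(base & 0xffffffu)      << 16)  |	/* base bits 0-23 */
		((uint64_t)access                  << 40)  |
		((uint64_t)((limit >> 16) & 0xfu)  << 48)  |
		((uint64_t)(flags & 0xfu)          << 52)  |	/* granularity/size bits */
		((uint64_t)(base >> 24)            << 56);
}

int main(void)
{
	/* 16-bit code, base 0, 64k limit, access 0x9a (present, code, readable) */
	assert(pack_descriptor(0x00000000, 0xffff, 0x9a, 0) == 0x00009a000000ffffULL);
	/* 16-bit data, base 0x100, 64k limit, access 0x92 (present, data, writable) */
	assert(pack_descriptor(0x00000100, 0xffff, 0x92, 0) == 0x000092000100ffffULL);
	puts("descriptor constants check out");
	return 0;
}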
/* This is 16-bit protected mode code to disable paging and the cache,
   switch to real mode and jump to the BIOS reset code.

   The instruction that switches to real mode by writing to CR0 must be
   followed immediately by a far jump instruction, which sets CS to a
   valid value for real mode, and flushes the prefetch queue to avoid
   running instructions that have already been decoded in protected
   mode.

   Clears all the flags except ET, especially PG (paging), PE
   (protected-mode enable) and TS (task switch for coprocessor state
   save).  Flushes the TLB after paging has been disabled.  Sets CD and
   NW, to disable the cache on a 486, and invalidates the cache.  This
   is more like the state of a 486 after reset.  I don't know if
   something else should be done for other chips.

   More could be done here to set up the registers as if a CPU reset had
   occurred; hopefully real BIOSs don't assume much. */
static unsigned char real_mode_switch [] =
{
	0x66, 0x0f, 0x20, 0xc0,			/* movl %cr0,%eax */
	0x66, 0x83, 0xe0, 0x11,			/* andl $0x00000011,%eax */
	0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,	/* orl $0x60000000,%eax */
	0x66, 0x0f, 0x22, 0xc0,			/* movl %eax,%cr0 */
	0x66, 0x0f, 0x22, 0xd8,			/* movl %eax,%cr3 */
	0x66, 0x0f, 0x20, 0xc3,			/* movl %cr0,%ebx */
	0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,	/* andl $0x60000000,%ebx */
	0x74, 0x02,				/* jz f */
	0x0f, 0x08,				/* invd */
	0x24, 0x10,				/* f: andb $0x10,al */
	0x66, 0x0f, 0x22, 0xc0			/* movl %eax,%cr0 */
};

static unsigned char jump_to_bios [] =
{
	0xea, 0x00, 0x00, 0xff, 0xff		/* ljmp $0xffff,$0x0000 */
};
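/*
 * The magic numbers above decode as follows: the far jump lands at
 * segment * 16 + offset, and the CR0 masks keep or set individual control
 * bits. A tiny standalone printout of those values:
 */
#include <stdio.h>

int main(void)
{
	/* real-mode far jump: physical address = segment * 16 + offset */
	unsigned int segment = 0xffff, offset = 0x0000;

	printf("ljmp target      : %#x (the BIOS reset entry)\n",
	       segment * 16 + offset);

	/* the masks applied to %cr0 in real_mode_switch */
	printf("kept on the way  : %#x (PE | ET)\n", 0x00000011);
	printf("set before reset : %#x (CD | NW, cache disabled)\n", 0x60000000);
	printf("kept at the end  : %#x (ET only - PE cleared, back to real mode)\n",
	       0x10);
	return 0;
}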
static inline void kb_wait(void)
{
	int i;

	for (i=0; i<0x10000; i++)
		if ((inb_p(0x64) & 0x02) == 0)
			break;
}

/*
 * Switch to real mode and then execute the code
 * specified by the code and length parameters.
 * We assume that length will always be less than 100!
 */
void machine_real_restart(unsigned char *code, int length)
{
	cli();

	/* Write zero to CMOS register number 0x0f, which the BIOS POST
	   routine will recognize as telling it to do a proper reboot.  (Well
	   that's what this book in front of me says -- it may only apply to
	   the Phoenix BIOS though, it's not clear).  At the same time,
	   disable NMIs by setting the top bit in the CMOS address register,
	   as we're about to do peculiar things to the CPU.  I'm not sure if
	   `outb_p' is needed instead of just `outb'.  Use it to be on the
	   safe side. */

	outb_p (0x8f, 0x70);
	outb_p (0x00, 0x71);

	/* Remap the kernel at virtual address zero, as well as offset zero
	   from the kernel segment.  This assumes the kernel segment starts at
	   virtual address PAGE_OFFSET. */

	memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
		sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);

	/* Make sure the first page is mapped to the start of physical memory.
	   It is normally not mapped, to trap kernel NULL pointer dereferences. */

	pg0[0] = _PAGE_RW | _PAGE_PRESENT;

	/*
	 * Use `swapper_pg_dir' as our page directory.
	 */
	asm volatile("movl %0,%%cr3": :"r" (__pa(swapper_pg_dir)));

	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
	   this on booting to tell it to "Bypass memory test (also warm
	   boot)".  This seems like a fairly standard thing that gets set by
	   REBOOT.COM programs, and the previous reset routine did this
	   too. */

	*((unsigned short *)0x472) = reboot_mode;

	/* For the switch to real mode, copy some code to low memory.  It has
	   to be in the first 64k because it is running in 16-bit mode, and it
	   has to have the same physical and virtual address, because it turns
	   off paging.  Copy it near the end of the first page, out of the way
	   of BIOS variables. */

	memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
		real_mode_switch, sizeof (real_mode_switch));
	memcpy ((void *) (0x1000 - 100), code, length);

	/* Set up the IDT for real mode. */

	__asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));

	/* Set up a GDT from which we can load segment descriptors for real
	   mode.  The GDT is not used in real mode; it is just needed here to
	   prepare the descriptors. */

	__asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));

	/* Load the data segment registers, and thus the descriptors ready for
	   real mode.  The base address of each segment is 0x100, 16 times the
	   selector value being loaded here.  This is so that the segment
	   registers don't have to be reloaded after switching to real mode:
	   the values are consistent for real mode operation already. */

	__asm__ __volatile__ ("movl $0x0010,%%eax\n"
				"\tmovl %%eax,%%ds\n"
				"\tmovl %%eax,%%es\n"
				"\tmovl %%eax,%%fs\n"
				"\tmovl %%eax,%%gs\n"
				"\tmovl %%eax,%%ss" : : : "eax");

	/* Jump to the 16-bit code that we copied earlier.  It disables paging
	   and the cache, switches to real mode, and jumps to the BIOS reset
	   entry point. */

	__asm__ __volatile__ ("ljmp $0x0008,%0"
				:
				: "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
}
void machine_restart(char * __unused)
{
#if CONFIG_SMP
	/*
	 * Stop all CPUs and turn off local APICs and the IO-APIC, so
	 * other OSs see a clean IRQ state.
	 */
	smp_send_stop();
	disable_IO_APIC();
#endif

	if(!reboot_thru_bios) {
		/* rebooting needs to touch the page at absolute addr 0 */
		*((unsigned short *)__va(0x472)) = reboot_mode;
		for (;;) {
			int i;
			for (i=0; i<100; i++) {
				kb_wait();
				udelay(50);
				outb(0xfe,0x64);         /* pulse reset low */
				udelay(50);
			}
			/* That didn't work - force a triple fault.. */
			__asm__ __volatile__("lidt %0": :"m" (no_idt));
			__asm__ __volatile__("int3");
		}
	}

	machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
}
void machine_halt(void)
{
}

void machine_power_off(void)
{
	if (pm_power_off)
		pm_power_off();
}
void show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;

	printk("\n");
	printk("EIP: %04x:[<%08lx>]",0xffff & regs->xcs,regs->eip);
	if (regs->xcs & 3)
		printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
	printk(" EFLAGS: %08lx\n",regs->eflags);
	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
		regs->eax,regs->ebx,regs->ecx,regs->edx);
	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
		regs->esi, regs->edi, regs->ebp);
	printk(" DS: %04x ES: %04x\n",
		0xffff & regs->xds,0xffff & regs->xes);

	__asm__("movl %%cr0, %0": "=r" (cr0));
	__asm__("movl %%cr2, %0": "=r" (cr2));
	__asm__("movl %%cr3, %0": "=r" (cr3));
	/* This could fault if %cr4 does not exist */
	__asm__("1: movl %%cr4, %0		\n"
		"2:				\n"
		".section __ex_table,\"a\"	\n"
		".long 1b,2b			\n"
		".previous			\n"
		: "=r" (cr4): "0" (0));
	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
}
/*
 * No need to lock the MM as we are the last user
 */
void release_segments(struct mm_struct *mm)
{
	void * ldt = mm->segments;

	/*
	 * free the LDT
	 */
	if (ldt) {
		mm->segments = NULL;
		clear_LDT();
		vfree(ldt);
	}
}
/*
 * Create a kernel thread
 */
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
	long retval, d0;

	__asm__ __volatile__(
		"movl %%esp,%%esi\n\t"
		"int $0x80\n\t"		/* Linux/i386 system call */
		"cmpl %%esp,%%esi\n\t"	/* child or parent? */
		"je 1f\n\t"		/* parent - jump */
		/* Load the argument into eax, and push it. That way, it does
		 * not matter whether the called function is compiled with
		 * -mregparm or not. */
		"movl %4,%%eax\n\t"
		"pushl %%eax\n\t"
		"call *%5\n\t"		/* call fn */
		"movl %3,%0\n\t"	/* exit */
		"int $0x80\n"
		"1:\t"
		:"=&a" (retval), "=&S" (d0)
		:"0" (__NR_clone), "i" (__NR_exit),
		 "r" (arg), "r" (fn),
		 "b" (flags | CLONE_VM)
		: "memory");

	return retval;
}
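/*
 * From userspace the same primitive is reached through clone(2). A minimal
 * sketch using the glibc wrapper rather than the raw int $0x80 path above;
 * child_fn and the 64k stack size are illustrative choices, not anything
 * this file defines:
 */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (64 * 1024)

static int child_fn(void *arg)
{
	const char *msg = arg;

	write(STDOUT_FILENO, msg, strlen(msg));
	return 0;
}

int main(void)
{
	char *stack = malloc(STACK_SIZE);
	pid_t pid;

	if (!stack)
		return 1;

	/* CLONE_VM shares the address space, as kernel_thread() does above;
	   the libc wrapper wants the *top* of the child's stack. */
	pid = clone(child_fn, stack + STACK_SIZE,
		    CLONE_VM | SIGCHLD, (void *)"child running\n");
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}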
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	/* nothing to do ... */
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);

	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	tsk->used_math = 0;
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		void * ldt = dead_task->mm->segments;

		// temporary debugging check
		if (ldt) {
			printk("WARNING: dead process %8s still has LDT? <%p>\n",
					dead_task->comm, ldt);
			BUG();
		}
	}
}
/*
 * we do not have to muck with descriptors here, that is
 * done in switch_mm() as needed.
 */
void copy_segments(struct task_struct *p, struct mm_struct *new_mm)
{
	struct mm_struct * old_mm = current->mm;
	void * old_ldt = old_mm->segments, * ldt;

	if (!old_ldt) {
		/*
		 * default LDT - use the one from init_task
		 */
		new_mm->segments = NULL;
		return;
	}

	/*
	 * Completely new LDT, we initialize it from the parent:
	 */
	ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
	if (!ldt)
		printk(KERN_WARNING "ldt allocation failed\n");
	else
		memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
	new_mm->segments = ldt;
	return;
}
/*
 * Save a segment.
 */
#define savesegment(seg,value) \
	asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value)))
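/*
 * The macro simply stores a segment register through inline asm. A
 * standalone illustration of the same "movl %%<seg>,%0" pattern, using a
 * register output instead of the in-memory store above:
 */
#include <stdio.h>

int main(void)
{
	unsigned int cs, ss;

	__asm__("movl %%cs,%0" : "=r" (cs));
	__asm__("movl %%ss,%0" : "=r" (ss));

	/* only the low 16 bits hold the selector */
	printf("cs=%#x ss=%#x\n", cs & 0xffff, ss & 0xffff);
	return 0;
}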
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;

	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
	struct_cpy(childregs, regs);
	childregs->eax = 0;
	childregs->esp = esp;

	p->thread.esp = (unsigned long) childregs;
	p->thread.esp0 = (unsigned long) (childregs+1);

	p->thread.eip = (unsigned long) ret_from_fork;

	savesegment(fs,p->thread.fs);
	savesegment(gs,p->thread.gs);

	unlazy_fpu(current);
	struct_cpy(&p->thread.i387, &current->thread.i387);

	return 0;
}
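/*
 * Forcing childregs->eax to 0 above is what makes fork()/clone() return 0
 * in the new process while the parent gets the child's pid back. The
 * visible effect from userspace, as a small sketch:
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* child: its copied registers had eax forced to 0 */
		printf("child sees 0\n");
		_exit(0);
	}
	/* parent: gets the child's pid in eax */
	printf("parent sees %d\n", (int)pid);
	waitpid(pid, NULL, 0);
	return 0;
}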
/*
 * fill in the user structure for a core dump..
 */
void dump_thread(struct pt_regs * regs, struct user * dump)
{
	int i;

/* changed the size calculations - should hopefully work better. lbt */
	dump->magic = CMAGIC;
	dump->start_code = 0;
	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
	dump->u_dsize -= dump->u_tsize;
	dump->u_ssize = 0;
	for (i = 0; i < 8; i++)
		dump->u_debugreg[i] = current->thread.debugreg[i];

	if (dump->start_stack < TASK_SIZE)
		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;

	dump->regs.ebx = regs->ebx;
	dump->regs.ecx = regs->ecx;
	dump->regs.edx = regs->edx;
	dump->regs.esi = regs->esi;
	dump->regs.edi = regs->edi;
	dump->regs.ebp = regs->ebp;
	dump->regs.eax = regs->eax;
	dump->regs.ds = regs->xds;
	dump->regs.es = regs->xes;
	savesegment(fs,dump->regs.fs);
	savesegment(gs,dump->regs.gs);
	dump->regs.orig_eax = regs->orig_eax;
	dump->regs.eip = regs->eip;
	dump->regs.cs = regs->xcs;
	dump->regs.eflags = regs->eflags;
	dump->regs.esp = regs->esp;
	dump->regs.ss = regs->xss;

	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,register) \
		__asm__("movl %0,%%db" #register \
			: /* no output */ \
			:"r" (thread->debugreg[register]))
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 */
void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	struct tss_struct *tss = init_tss + smp_processor_id();

	unlazy_fpu(prev_p);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->esp0 = next->esp0;

	/*
	 * Save away %fs and %gs. No need to save %es and %ds, as
	 * those are always kernel segments while inside the kernel.
	 */
	asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
	asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));

	/*
	 * Restore %fs and %gs.
	 */
	loadsegment(fs, next->fs);
	loadsegment(gs, next->gs);

	/*
	 * Now maybe reload the debug registers
	 */
	if (next->debugreg[7]){
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (prev->ioperm || next->ioperm) {
		if (next->ioperm) {
			/*
			 * 4 cachelines copy ... not good, but not that
			 * bad either. Anyone got something better?
			 * This only affects processes which use ioperm().
			 * [Putting the TSSs into 4k-tlb mapped regions
			 *  and playing VM tricks to switch the IO bitmap
			 *  is not really acceptable.]
			 */
			memcpy(tss->io_bitmap, next->io_bitmap,
				 IO_BITMAP_SIZE*sizeof(unsigned long));
			tss->bitmap = IO_BITMAP_OFFSET;
		} else
			/*
			 * a bitmap offset pointing outside of the TSS limit
			 * causes a nicely controllable SIGSEGV if a process
			 * tries to use a port IO instruction. The first
			 * sys_ioperm() call sets up the bitmap properly.
			 */
			tss->bitmap = INVALID_IO_BITMAP_OFFSET;
	}
}
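/*
 * The I/O bitmap switched above is what ioperm(2) manipulates from
 * userspace. A minimal sketch (needs root/CAP_SYS_RAWIO; port 0x80, the
 * traditional POST/delay port, is chosen here purely for illustration):
 */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	/* the first ioperm() call makes the kernel install a real
	   I/O bitmap in the TSS for this task */
	if (ioperm(0x80, 1, 1) < 0) {
		perror("ioperm");
		return 1;
	}
	outb(0x00, 0x80);	/* now allowed without a SIGSEGV */
	ioperm(0x80, 1, 0);	/* drop the access again */
	return 0;
}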
asmlinkage int sys_fork(struct pt_regs regs)
{
	return do_fork(SIGCHLD, regs.esp, &regs, 0);
}

asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags;
	unsigned long newsp;

	clone_flags = regs.ebx;
	newsp = regs.ecx;
	if (!newsp)
		newsp = regs.esp;
	return do_fork(clone_flags, newsp, &regs, 0);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);
}
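/*
 * What CLONE_VFORK | CLONE_VM means from userspace: the parent is suspended
 * until the child execs or exits, and until then the child runs in the
 * parent's address space. A minimal sketch (/bin/true is just an arbitrary
 * program to exec):
 */
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = vfork();

	if (pid == 0) {
		/* child borrows the parent's address space, so only
		   exec*() or _exit() are safe here */
		execl("/bin/true", "true", (char *)NULL);
		_exit(127);			/* only reached if exec failed */
	}
	if (pid < 0) {
		perror("vfork");
		return 1;
	}
	waitpid(pid, NULL, 0);
	printf("child %d finished\n", (int)pid);
	return 0;
}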
/*
 * sys_execve() executes a new program.
 */
asmlinkage int sys_execve(struct pt_regs regs)
{
	int error;
	char * filename;

	filename = getname((char *) regs.ebx);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		goto out;
	error = do_execve(filename, (char **) regs.ecx, (char **) regs.edx, &regs);
	if (error == 0)
		current->ptrace &= ~PT_DTRACE;
	putname(filename);
out:
	return error;
}
/*
 * These bracket the sleeping functions..
 */
extern void scheduling_functions_start_here(void);
extern void scheduling_functions_end_here(void);
#define first_sched	((unsigned long) scheduling_functions_start_here)
#define last_sched	((unsigned long) scheduling_functions_end_here)

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long ebp, esp, eip;
	unsigned long stack_page;
	int count = 0;
	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack_page = (unsigned long)p;
	esp = p->thread.esp;
	if (!stack_page || esp < stack_page || esp > 8188+stack_page)
		return 0;
	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
	ebp = *(unsigned long *) esp;
	do {
		if (ebp < stack_page || ebp > 8184+stack_page)
			return 0;
		eip = *(unsigned long *) (ebp+4);
		if (eip < first_sched || eip >= last_sched)
			return eip;
		ebp = *(unsigned long *) ebp;
	} while (count++ < 16);
	return 0;
}
#undef last_sched
#undef first_sched
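/*
 * get_wchan() works because every frame built with frame pointers is laid
 * out as [saved frame pointer][return address]. A standalone sketch of the
 * same walk over the current process's own stack (build with frame
 * pointers, e.g. gcc -O0 -fno-omit-frame-pointer; the function names are
 * made up, and a real walker must bound-check every frame the way
 * get_wchan() checks against the stack page):
 */
#include <stdio.h>

static void show_callers(void)
{
	unsigned long *fp = __builtin_frame_address(0);
	int depth = 0;

	/* fp[0] is the caller's saved frame pointer, fp[1] the return address */
	while (fp && depth++ < 4) {
		printf("frame %d: return address %#lx\n", depth, fp[1]);
		fp = (unsigned long *)fp[0];
	}
}

static void inner(void) { show_callers(); }
static void outer(void) { inner(); }

int main(void)
{
	outer();
	return 0;
}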