/*
 *  linux/arch/i386/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
#define __KERNEL_SYSCALLS__
#include <stdarg.h>

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/malloc.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/config.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif

#include <linux/irq.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

int hlt_counter;

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);

/*
 * Power off function, if any
 */
void (*pm_power_off)(void);
void disable_hlt(void)
{
        hlt_counter++;
}

void enable_hlt(void)
{
        hlt_counter--;
}
/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
        if (current_cpu_data.hlt_works_ok && !hlt_counter) {
                __cli();
                if (!current->need_resched)
                        safe_halt();
                else
                        __sti();
        }
}
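/*
 * Note on default_idle() above: need_resched is checked with interrupts
 * disabled, and safe_halt() re-enables them together with the halt
 * (on i386 it is an "sti; hlt" pair), so a wakeup interrupt that sets
 * need_resched cannot slip in between the check and the hlt.
 */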
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
        int oldval;

        __sti();

        /*
         * Deal with another CPU just having chosen a thread to
         * run here:
         */
        oldval = xchg(&current->need_resched, -1);

        if (!oldval)
                asm volatile(
                        "2:"
                        "cmpl $-1, %0;"
                        "rep; nop;"
                        "je 2b;"
                        : :"m" (current->need_resched));
}
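/*
 * Note on poll_idle() above: the xchg() atomically marks need_resched
 * with -1 and returns the previous value.  If a reschedule was already
 * pending (non-zero old value) the function returns at once; otherwise
 * the inline asm spins while need_resched is still -1, using "rep; nop"
 * (the PAUSE hint) to be easy on the bus.  Polling runs with interrupts
 * enabled (__sti() above).
 */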
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
        /* endless idle loop with no priority at all */
        init_idle();
        current->nice = 20;
        current->counter = -100;

        while (1) {
                void (*idle)(void) = pm_idle;
                if (!idle)
                        idle = default_idle;
                while (!current->need_resched)
                        idle();
                schedule();
                check_pgt_cache();
        }
}
static int __init idle_setup (char *str)
{
        if (!strncmp(str, "poll", 4)) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        }

        return 1;
}

__setup("idle=", idle_setup);
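/*
 * Example: booting with "idle=poll" on the kernel command line selects
 * poll_idle() as the idle routine; any other value leaves the default.
 */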
static long no_idt[2];
static int reboot_mode;
static int reboot_thru_bios;
static int __init reboot_setup(char *str)
{
        while(1) {
                switch (*str) {
                case 'w': /* "warm" reboot (no memory testing etc) */
                        reboot_mode = 0x1234;
                        break;
                case 'c': /* "cold" reboot (with memory testing etc) */
                        reboot_mode = 0x0;
                        break;
                case 'b': /* "bios" reboot by jumping through the BIOS */
                        reboot_thru_bios = 1;
                        break;
                case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
                        reboot_thru_bios = 0;
                        break;
                }
                if((str = strchr(str,',')) != NULL)
                        str++;
                else
                        break;
        }
        return 1;
}

__setup("reboot=", reboot_setup);
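/*
 * Only the first character of each comma-separated word matters above,
 * so e.g. "reboot=warm" (or just "reboot=w") requests a warm reboot and
 * "reboot=cold,bios" requests a cold reboot through the BIOS.
 */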
/* The following code and data reboots the machine by switching to real
   mode and jumping to the BIOS reset entry point, as if the CPU has
   really been reset.  The previous version asked the keyboard
   controller to pulse the CPU reset line, which is more thorough, but
   doesn't work with at least one type of 486 motherboard.  It is easy
   to stop this code working; hence the copious comments. */
static unsigned long long
real_mode_gdt_entries [3] =
{
        0x0000000000000000ULL,                  /* Null descriptor */
        0x00009a000000ffffULL,                  /* 16-bit real-mode 64k code at 0x00000000 */
        0x000092000100ffffULL                   /* 16-bit real-mode 64k data at 0x00000100 */
};

static struct
{
        unsigned short       size __attribute__ ((packed));
        unsigned long long * base __attribute__ ((packed));
}
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 };
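/*
 * For reference, the entries above use the standard GDT descriptor
 * layout (limit 15:0, base 23:0, access byte, flags/limit 19:16,
 * base 31:24).  0x00009a000000ffff is therefore a present 16-bit code
 * segment with base 0 and a 64k limit, and 0x000092000100ffff is a
 * present 16-bit writable data segment with base 0x100 and a 64k
 * limit, matching the comments on real_mode_gdt_entries.
 */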
/* This is 16-bit protected mode code to disable paging and the cache,
   switch to real mode and jump to the BIOS reset code.

   The instruction that switches to real mode by writing to CR0 must be
   followed immediately by a far jump instruction, which sets CS to a
   valid value for real mode, and flushes the prefetch queue to avoid
   running instructions that have already been decoded in protected
   mode.

   Clears all the flags except ET, especially PG (paging), PE
   (protected-mode enable) and TS (task switch for coprocessor state
   save).  Flushes the TLB after paging has been disabled.  Sets CD and
   NW, to disable the cache on a 486, and invalidates the cache.  This
   is more like the state of a 486 after reset.  I don't know if
   something else should be done for other chips.

   More could be done here to set up the registers as if a CPU reset had
   occurred; hopefully real BIOSs don't assume much. */
static unsigned char real_mode_switch [] =
{
        0x66, 0x0f, 0x20, 0xc0,                         /* movl  %cr0,%eax        */
        0x66, 0x83, 0xe0, 0x11,                         /* andl  $0x00000011,%eax */
        0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,             /* orl   $0x60000000,%eax */
        0x66, 0x0f, 0x22, 0xc0,                         /* movl  %eax,%cr0        */
        0x66, 0x0f, 0x22, 0xd8,                         /* movl  %eax,%cr3        */
        0x66, 0x0f, 0x20, 0xc3,                         /* movl  %cr0,%ebx        */
        0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,       /* andl  $0x60000000,%ebx */
        0x74, 0x02,                                     /* jz    f                */
        0x0f, 0x08,                                     /* invd                   */
        0x24, 0x10,                                     /* f: andb $0x10,al       */
        0x66, 0x0f, 0x22, 0xc0                          /* movl  %eax,%cr0        */
};
static unsigned char jump_to_bios [] =
{
        0xea, 0x00, 0x00, 0xff, 0xff                    /* ljmp  $0xffff,$0x0000  */
};
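/*
 * The 0x66 bytes above are operand-size prefixes: real_mode_switch
 * executes from a 16-bit code segment, so its 32-bit register
 * operations need the override.  jump_to_bios lands at ffff:0000
 * (physical 0xffff0), the conventional BIOS reset entry point.
 */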
static inline void kb_wait(void)
{
        int i;

        for (i=0; i<0x10000; i++)
                if ((inb_p(0x64) & 0x02) == 0)
                        break;
}
/*
 * Switch to real mode and then execute the code
 * specified by the code and length parameters.
 * We assume that length will always be less than 100!
 */
void machine_real_restart(unsigned char *code, int length)
{
        cli();

        /* Write zero to CMOS register number 0x0f, which the BIOS POST
           routine will recognize as telling it to do a proper reboot.  (Well
           that's what this book in front of me says -- it may only apply to
           the Phoenix BIOS though, it's not clear).  At the same time,
           disable NMIs by setting the top bit in the CMOS address register,
           as we're about to do peculiar things to the CPU.  I'm not sure if
           `outb_p' is needed instead of just `outb'.  Use it to be on the
           safe side. */

        outb_p (0x8f, 0x70);
        outb_p (0x00, 0x71);

        /* Remap the kernel at virtual address zero, as well as offset zero
           from the kernel segment.  This assumes the kernel segment starts at
           virtual address PAGE_OFFSET. */

        memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
                sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);

        /* Make sure the first page is mapped to the start of physical memory.
           It is normally not mapped, to trap kernel NULL pointer dereferences. */

        pg0[0] = _PAGE_RW | _PAGE_PRESENT;

        /*
         * Use `swapper_pg_dir' as our page directory.
         */
        asm volatile("movl %0,%%cr3": :"r" (__pa(swapper_pg_dir)));

        /* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
           this on booting to tell it to "Bypass memory test (also warm
           boot)".  This seems like a fairly standard thing that gets set by
           REBOOT.COM programs, and the previous reset routine did this
           too. */

        *((unsigned short *)0x472) = reboot_mode;

        /* For the switch to real mode, copy some code to low memory.  It has
           to be in the first 64k because it is running in 16-bit mode, and it
           has to have the same physical and virtual address, because it turns
           off paging.  Copy it near the end of the first page, out of the way
           of BIOS variables. */

        memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
                real_mode_switch, sizeof (real_mode_switch));
        memcpy ((void *) (0x1000 - 100), code, length);

        /* Set up the IDT for real mode. */

        __asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));

        /* Set up a GDT from which we can load segment descriptors for real
           mode.  The GDT is not used in real mode; it is just needed here to
           prepare the descriptors. */

        __asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));

        /* Load the data segment registers, and thus the descriptors ready for
           real mode.  The base address of each segment is 0x100, 16 times the
           selector value being loaded here.  This is so that the segment
           registers don't have to be reloaded after switching to real mode:
           the values are consistent for real mode operation already. */

        __asm__ __volatile__ ("movl $0x0010,%%eax\n"
                                "\tmovl %%eax,%%ds\n"
                                "\tmovl %%eax,%%es\n"
                                "\tmovl %%eax,%%fs\n"
                                "\tmovl %%eax,%%gs\n"
                                "\tmovl %%eax,%%ss" : : : "eax");

        /* Jump to the 16-bit code that we copied earlier.  It disables paging
           and the cache, switches to real mode, and jumps to the BIOS reset
           entry point. */

        __asm__ __volatile__ ("ljmp $0x0008,%0"
                                :
                                : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
}
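/*
 * The selector 0x0008 in the final ljmp picks the second entry of
 * real_mode_gdt (the 16-bit code segment with base 0), and the "i"
 * operand supplies the low-memory address the switch code was copied to.
 */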
void machine_restart(char * __unused)
{
#if CONFIG_SMP
        /*
         * Stop all CPUs and turn off local APICs and the IO-APIC, so
         * other OSs see a clean IRQ state.
         */
        smp_send_stop();
        disable_IO_APIC();
#endif

        if(!reboot_thru_bios) {
                /* rebooting needs to touch the page at absolute addr 0 */
                *((unsigned short *)__va(0x472)) = reboot_mode;
                for (;;) {
                        int i;
                        for (i=0; i<100; i++) {
                                kb_wait();
                                udelay(50);
                                outb(0xfe,0x64);         /* pulse reset low */
                                udelay(50);
                        }
                        /* That didn't work - force a triple fault.. */
                        __asm__ __volatile__("lidt %0": :"m" (no_idt));
                        __asm__ __volatile__("int3");
                }
        }

        machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
}
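/*
 * The fallback in machine_restart() above forces a triple fault: no_idt
 * is a zero-limit IDT, so the int3 cannot be delivered, the resulting
 * faults cannot be delivered either, and the processor resets itself.
 */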
void machine_halt(void)
{
}
void machine_power_off(void)
{
        if (pm_power_off)
                pm_power_off();
}
void show_regs(struct pt_regs * regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;

        printk("\n");
        printk("EIP: %04x:[<%08lx>]",0xffff & regs->xcs,regs->eip);
        if (regs->xcs & 3)
                printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
        printk(" EFLAGS: %08lx\n",regs->eflags);
        printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
                regs->eax,regs->ebx,regs->ecx,regs->edx);
        printk("ESI: %08lx EDI: %08lx EBP: %08lx",
                regs->esi, regs->edi, regs->ebp);
        printk(" DS: %04x ES: %04x\n",
                0xffff & regs->xds,0xffff & regs->xes);

        __asm__("movl %%cr0, %0": "=r" (cr0));
        __asm__("movl %%cr2, %0": "=r" (cr2));
        __asm__("movl %%cr3, %0": "=r" (cr3));
        /* This could fault if %cr4 does not exist */
        __asm__("1: movl %%cr4, %0              \n"
                "2:                             \n"
                ".section __ex_table,\"a\"      \n"
                ".long 1b,2b                    \n"
                ".previous                      \n"
                : "=r" (cr4): "0" (0));
        printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
}
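/*
 * The cr4 read above is covered by an __ex_table entry: if the mov
 * faults on a CPU without CR4, the kernel's exception-table fixup
 * resumes execution at label 2, and cr4 keeps the zero it was preloaded
 * with via the "0" (0) constraint.
 */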
/*
 * No need to lock the MM as we are the last user
 */
void destroy_context(struct mm_struct *mm)
{
        void * ldt = mm->context.segments;

        /*
         * free the LDT
         */
        if (ldt) {
                mm->context.segments = NULL;
                clear_LDT();
                vfree(ldt);
        }
}
/*
 * Create a kernel thread
 */
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
        long retval, d0;

        __asm__ __volatile__(
                "movl %%esp,%%esi\n\t"
                "int $0x80\n\t"         /* Linux/i386 system call */
                "cmpl %%esp,%%esi\n\t"  /* child or parent? */
                "je 1f\n\t"             /* parent - jump */
                /* Load the argument into eax, and push it.  That way, it does
                 * not matter whether the called function is compiled with
                 * -mregparm or not.  */
                "movl %4,%%eax\n\t"
                "pushl %%eax\n\t"
                "call *%5\n\t"          /* call fn */
                "movl %3,%0\n\t"        /* exit */
                "int $0x80\n"
                "1:\t"
                :"=&a" (retval), "=&S" (d0)
                :"0" (__NR_clone), "i" (__NR_exit),
                 "r" (arg), "r" (fn),
                 "b" (flags | CLONE_VM)
                : "memory");
        return retval;
}
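/*
 * In kernel_thread() above, both the parent and the child return from
 * the int $0x80 clone.  The parent is still on the stack whose pointer
 * was saved in %esi, so it jumps to label 1 and returns the child's pid
 * in %eax; the child comes back on a different stack, falls through,
 * pushes arg, calls fn(), and then issues __NR_exit.  CLONE_VM is
 * always or'ed in, so the new thread shares the kernel's address space.
 */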
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        /* nothing to do ... */
}
void flush_thread(void)
{
        struct task_struct *tsk = current;

        memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        tsk->used_math = 0;
}
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                void * ldt = dead_task->mm->context.segments;

                // temporary debugging check
                if (ldt) {
                        printk("WARNING: dead process %8s still has LDT? <%p>\n",
                                        dead_task->comm, ldt);
                        BUG();
                }
        }
}
/*
 * we do not have to muck with descriptors here, that is
 * done in switch_mm() as needed.
 */
int init_new_context(struct task_struct *p, struct mm_struct *new_mm)
{
        struct mm_struct * old_mm;
        void *old_ldt, *ldt;

        ldt = NULL;
        old_mm = current->mm;
        if (old_mm && (old_ldt = old_mm->context.segments) != NULL) {
                /*
                 * Completely new LDT, we initialize it from the parent:
                 */
                ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
                if (!ldt)
                        return -ENOMEM;
                memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
        }
        new_mm->context.segments = ldt;
        return 0;
}
/*
 * Save a segment.
 */
#define savesegment(seg,value) \
        asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value)))
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
        unsigned long unused,
        struct task_struct * p, struct pt_regs * regs)
{
        struct pt_regs * childregs;

        childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
        struct_cpy(childregs, regs);
        childregs->eax = 0;
        childregs->esp = esp;

        p->thread.esp = (unsigned long) childregs;
        p->thread.esp0 = (unsigned long) (childregs+1);

        p->thread.eip = (unsigned long) ret_from_fork;

        savesegment(fs,p->thread.fs);
        savesegment(gs,p->thread.gs);

        unlazy_fpu(current);
        struct_cpy(&p->thread.i387, &current->thread.i387);

        return 0;
}
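/*
 * copy_thread() above places the child's pt_regs at the top of its 8k
 * task_struct/stack area and zeroes eax there, which is why fork()
 * returns 0 in the child.  thread.esp0 (the kernel stack top) is loaded
 * into the TSS on the next switch, and thread.eip is set to
 * ret_from_fork so the child starts executing there the first time it
 * is scheduled.
 */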
/*
 * fill in the user structure for a core dump..
 */
void dump_thread(struct pt_regs * regs, struct user * dump)
{
        int i;

/* changed the size calculations - should hopefully work better. lbt */
        dump->magic = CMAGIC;
        dump->start_code = 0;
        dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
        dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
        dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
        dump->u_dsize -= dump->u_tsize;
        dump->u_ssize = 0;
        for (i = 0; i < 8; i++)
                dump->u_debugreg[i] = current->thread.debugreg[i];

        if (dump->start_stack < TASK_SIZE)
                dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;

        dump->regs.ebx = regs->ebx;
        dump->regs.ecx = regs->ecx;
        dump->regs.edx = regs->edx;
        dump->regs.esi = regs->esi;
        dump->regs.edi = regs->edi;
        dump->regs.ebp = regs->ebp;
        dump->regs.eax = regs->eax;
        dump->regs.ds = regs->xds;
        dump->regs.es = regs->xes;
        savesegment(fs,dump->regs.fs);
        savesegment(gs,dump->regs.gs);
        dump->regs.orig_eax = regs->orig_eax;
        dump->regs.eip = regs->eip;
        dump->regs.cs = regs->xcs;
        dump->regs.eflags = regs->eflags;
        dump->regs.esp = regs->esp;
        dump->regs.ss = regs->xss;

        dump->u_fpvalid = dump_fpu (regs, &dump->i387);
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,register) \
                __asm__("movl %0,%%db" #register  \
                        : /* no output */ \
                        :"r" (thread->debugreg[register]))
/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 */
void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
        struct tss_struct *tss = init_tss + smp_processor_id();

        unlazy_fpu(prev_p);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        tss->esp0 = next->esp0;

        /*
         * Save away %fs and %gs. No need to save %es and %ds, as
         * those are always kernel segments while inside the kernel.
         */
        asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
        asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));

        /*
         * Restore %fs and %gs.
         */
        loadsegment(fs, next->fs);
        loadsegment(gs, next->gs);

        /*
         * Now maybe reload the debug registers
         */
        if (next->debugreg[7]){
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

        if (prev->ioperm || next->ioperm) {
                if (next->ioperm) {
                        /*
                         * 4 cachelines copy ... not good, but not that
                         * bad either. Anyone got something better?
                         * This only affects processes which use ioperm().
                         * [Putting the TSSs into 4k-tlb mapped regions
                         * and playing VM tricks to switch the IO bitmap
                         * is not really acceptable.]
                         */
                        memcpy(tss->io_bitmap, next->io_bitmap,
                                IO_BITMAP_SIZE*sizeof(unsigned long));
                        tss->bitmap = IO_BITMAP_OFFSET;
                } else
                        /*
                         * a bitmap offset pointing outside of the TSS limit
                         * causes a nicely controllable SIGSEGV if a process
                         * tries to use a port IO instruction. The first
                         * sys_ioperm() call sets up the bitmap properly.
                         */
                        tss->bitmap = INVALID_IO_BITMAP_OFFSET;
        }
}
asmlinkage int sys_fork(struct pt_regs regs)
{
        return do_fork(SIGCHLD, regs.esp, &regs, 0);
}
asmlinkage int sys_clone(struct pt_regs regs)
{
        unsigned long clone_flags;
        unsigned long newsp;

        clone_flags = regs.ebx;
        newsp = regs.ecx;
        if (!newsp)
                newsp = regs.esp;
        return do_fork(clone_flags, newsp, &regs, 0);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage int sys_execve(struct pt_regs regs)
{
        int error;
        char * filename;

        filename = getname((char *) regs.ebx);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                goto out;
        error = do_execve(filename, (char **) regs.ecx, (char **) regs.edx, &regs);
        if (error == 0)
                current->ptrace &= ~PT_DTRACE;
        putname(filename);
out:
        return error;
}
/*
 * These bracket the sleeping functions..
 */
extern void scheduling_functions_start_here(void);
extern void scheduling_functions_end_here(void);
#define first_sched     ((unsigned long) scheduling_functions_start_here)
#define last_sched      ((unsigned long) scheduling_functions_end_here)
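/*
 * get_wchan() below reports where a sleeping task is blocked: starting
 * from the esp saved by switch_to() it walks the saved frame pointers
 * ([ebp+4] is the return address, [ebp] the caller's ebp), skips return
 * addresses that lie inside the scheduler functions bracketed above,
 * and returns the first one outside them.  The walk is bounded to 16
 * frames and sanity-checked against the task's 8k stack.
 */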
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long ebp, esp, eip;
        unsigned long stack_page;
        int count = 0;
        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack_page = (unsigned long)p;
        esp = p->thread.esp;
        if (!stack_page || esp < stack_page || esp > 8188+stack_page)
                return 0;
        /* include/asm-i386/system.h:switch_to() pushes ebp last. */
        ebp = *(unsigned long *) esp;
        do {
                if (ebp < stack_page || ebp > 8184+stack_page)
                        return 0;
                eip = *(unsigned long *) (ebp+4);
                if (eip < first_sched || eip >= last_sched)
                        return eip;
                ebp = *(unsigned long *) ebp;
        } while (count++ < 16);
        return 0;
}
#undef last_sched
#undef first_sched