/*
 *  linux/arch/i386/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 */
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#ifdef CONFIG_BLK_DEV_INITRD
#include <linux/blk.h>
#endif
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
unsigned long highstart_pfn, highend_pfn;
static unsigned long totalram_pages = 0;
static unsigned long totalhigh_pages = 0;

extern void show_net_buffers(void);
/*
 * BAD_PAGE is the page that is used for page faults when linux
 * is out-of-memory. Older versions of linux just did a
 * do_exit(), but using this instead means there is less risk
 * for a process dying in kernel mode, possibly leaving an inode
 * unused etc..
 *
 * BAD_PAGETABLE is the accompanying page-table: it is initialized
 * to point to BAD_PAGE entries.
 *
 * ZERO_PAGE is a special page that is used for zero-initialized
 * data and COW.
 */

/*
 * These are allocated in head.S so that we get proper page alignment.
 * If you change the size of these then change head.S as well.
 */
extern char empty_bad_page[PAGE_SIZE];
#if CONFIG_X86_PAE
extern pmd_t empty_bad_pmd_table[PTRS_PER_PMD];
#endif
extern pte_t empty_bad_pte_table[PTRS_PER_PTE];
/*
 * We init them before every return and make them writable-shared.
 * This guarantees we get out of the kernel in some more or less sane
 * way.
 */
#if CONFIG_X86_PAE
static pmd_t * get_bad_pmd_table(void)
{
        pmd_t v;
        int i;

        set_pmd(&v, __pmd(_PAGE_TABLE + __pa(empty_bad_pte_table)));

        for (i = 0; i < PAGE_SIZE/sizeof(pmd_t); i++)
                empty_bad_pmd_table[i] = v;

        return empty_bad_pmd_table;
}
#endif
static pte_t * get_bad_pte_table(void)
{
        pte_t v;
        int i;

        v = pte_mkdirty(mk_pte_phys(__pa(empty_bad_page), PAGE_SHARED));

        for (i = 0; i < PAGE_SIZE/sizeof(pte_t); i++)
                empty_bad_pte_table[i] = v;

        return empty_bad_pte_table;
}
void __handle_bad_pmd(pmd_t *pmd)
{
        pmd_ERROR(*pmd);
        set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(get_bad_pte_table())));
}

void __handle_bad_pmd_kernel(pmd_t *pmd)
{
        pmd_ERROR(*pmd);
        set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(get_bad_pte_table())));
}
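
/*
 * Slow-path allocation of a pte page for a kernel mapping (a user-mode
 * variant follows below). On allocation failure the pmd is pointed at
 * the bad page table, so later accesses hit the bad page rather than an
 * unmapped table.
 */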
pte_t *get_pte_kernel_slow(pmd_t *pmd, unsigned long offset)
{
        pte_t *pte;

        pte = (pte_t *) __get_free_page(GFP_KERNEL);
        if (pmd_none(*pmd)) {
                if (pte) {
                        clear_page(pte);
                        set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
                        return pte + offset;
                }
                set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(get_bad_pte_table())));
                return NULL;
        }
        free_page((unsigned long)pte);
        if (pmd_bad(*pmd)) {
                __handle_bad_pmd_kernel(pmd);
                return NULL;
        }
        return (pte_t *) pmd_page(*pmd) + offset;
}
pte_t *get_pte_slow(pmd_t *pmd, unsigned long offset)
{
        unsigned long pte;

        pte = (unsigned long) __get_free_page(GFP_KERNEL);
        if (pmd_none(*pmd)) {
                if (pte) {
                        clear_page((void *)pte);
                        set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));
                        return (pte_t *)pte + offset;
                }
                set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(get_bad_pte_table())));
                return NULL;
        }
        free_page(pte);
        if (pmd_bad(*pmd)) {
                __handle_bad_pmd(pmd);
                return NULL;
        }
        return (pte_t *) pmd_page(*pmd) + offset;
}
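
/*
 * Trim the page-table quicklists: once the cache has grown past 'high'
 * entries, free pages back to the system until it drops to 'low'.
 * Returns the number of pages freed.
 */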
int do_check_pgt_cache(int low, int high)
{
        int freed = 0;
        if (pgtable_cache_size > high) {
                do {
                        if (pgd_quicklist)
                                free_pgd_slow(get_pgd_fast()), freed++;
                        if (pmd_quicklist)
                                free_pmd_slow(get_pmd_fast()), freed++;
                        if (pte_quicklist)
                                free_pte_slow(get_pte_fast()), freed++;
                } while (pgtable_cache_size > low);
        }
        return freed;
}
/*
 * NOTE: pagetable_init() allocates all the fixmap page tables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
#if CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;

#define kmap_get_fixmap_pte(vaddr) \
        pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr))

void __init kmap_init(void)
{
        unsigned long kmap_vstart;

        /* cache the first kmap pte */
        kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
        kmap_pte = kmap_get_fixmap_pte(kmap_vstart);

        kmap_prot = PAGE_KERNEL;
        if (boot_cpu_data.x86_capability & X86_FEATURE_PGE)
                pgprot_val(kmap_prot) |= _PAGE_GLOBAL;
}
#endif /* CONFIG_HIGHMEM */
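
/*
 * Dump a summary of memory usage (free areas, swap, page counts) to
 * the console.
 */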
void show_mem(void)
{
        int i, free = 0, total = 0, reserved = 0;
        int shared = 0, cached = 0;
        int highmem = 0;

        printk("Mem-info:\n");
        show_free_areas();
        printk("Free swap: %6dkB\n", nr_swap_pages << (PAGE_SHIFT-10));
        i = max_mapnr;
        while (i-- > 0) {
                total++;
                if (PageHighMem(mem_map+i))
                        highmem++;
                if (PageReserved(mem_map+i))
                        reserved++;
                else if (PageSwapCache(mem_map+i))
                        cached++;
                else if (!page_count(mem_map+i))
                        free++;
                else
                        shared += page_count(mem_map+i) - 1;
        }
        printk("%d pages of RAM\n", total);
        printk("%d pages of HIGHMEM\n", highmem);
        printk("%d reserved pages\n", reserved);
        printk("%d pages shared\n", shared);
        printk("%d pages swap cached\n", cached);
        printk("%ld pages in page table cache\n", pgtable_cache_size);
        show_buffers();
#ifdef CONFIG_NET
        show_net_buffers();
#endif
}
/* References to section boundaries */

extern char _text, _etext, _edata, __bss_start, _end;
extern char __init_begin, __init_end;
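
/*
 * Map a single kernel page at 'vaddr' to physical address 'phys' in
 * swapper_pg_dir, then flush just that one TLB entry.
 */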
static void set_pte_phys (unsigned long vaddr, unsigned long phys)
{
        pgprot_t prot;
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + __pgd_offset(vaddr);
        pmd = pmd_offset(pgd, vaddr);
        pte = pte_offset(pmd, vaddr);
        prot = PAGE_KERNEL;
        if (boot_cpu_data.x86_capability & X86_FEATURE_PGE)
                pgprot_val(prot) |= _PAGE_GLOBAL;
        set_pte(pte, mk_pte_phys(phys, prot));

        /*
         * It's enough to flush this one mapping.
         */
        __flush_tlb_one(vaddr);
}
void set_fixmap (enum fixed_addresses idx, unsigned long phys)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys);
}
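
/*
 * Build the page-table skeleton (pgd/pmd/pte pages) covering a fixed
 * virtual address range; the actual pte entries are filled in later,
 * e.g. by set_fixmap().
 */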
static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
{
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;
        int i, j;

        i = __pgd_offset(start);
        j = __pmd_offset(start);
        pgd = pgd_base + i;

        for ( ; (i < PTRS_PER_PGD) && (start != end); pgd++, i++) {
#if CONFIG_X86_PAE
                if (pgd_none(*pgd)) {
                        pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
                        set_pgd(pgd, __pgd(__pa(pmd) + 0x1));
                        if (pmd != pmd_offset(pgd, start))
                                BUG();
                }
                pmd = pmd_offset(pgd, start);
#else
                pmd = (pmd_t *)pgd;
#endif
                for (; (j < PTRS_PER_PMD) && start; pmd++, j++) {
                        if (pmd_none(*pmd)) {
                                pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
                                set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
                                if (pte != pte_offset(pmd, 0))
                                        BUG();
                        }
                        start += PMD_SIZE;
                }
                j = 0;
        }
}
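
/*
 * Set up the kernel page tables: map all low memory at PAGE_OFFSET,
 * using 4MB PSE mappings where the CPU supports them, then create the
 * structures for the fixmap range and (with CONFIG_HIGHMEM) the
 * permanent kmap area.
 */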
static void __init pagetable_init(void)
{
        pgd_t *pgd, *pgd_base;
        pmd_t *pmd;
        pte_t *pte;
        int i, j, k;
        unsigned long vaddr;
        unsigned long end = (unsigned long)__va(max_low_pfn*PAGE_SIZE);

        pgd_base = swapper_pg_dir;

        vaddr = PAGE_OFFSET;
        i = __pgd_offset(vaddr);
        pgd = pgd_base + i;

        for (; (i < PTRS_PER_PGD) && (vaddr <= end); pgd++, i++) {
                vaddr = i*PGDIR_SIZE;
#if CONFIG_X86_PAE
                pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
                set_pgd(pgd, __pgd(__pa(pmd) + 0x1));
#else
                pmd = (pmd_t *)pgd;
#endif
                if (pmd != pmd_offset(pgd, 0))
                        BUG();
                for (j = 0; (j < PTRS_PER_PMD) && (vaddr <= end); pmd++, j++) {
                        vaddr = i*PGDIR_SIZE + j*PMD_SIZE;
                        if (cpu_has_pse) {
                                unsigned long __pe;

                                set_in_cr4(X86_CR4_PSE);
                                boot_cpu_data.wp_works_ok = 1;
                                __pe = _KERNPG_TABLE + _PAGE_PSE + __pa(vaddr);
                                /* Make it "global" too if supported */
                                if (cpu_has_pge) {
                                        set_in_cr4(X86_CR4_PGE);
                                        __pe += _PAGE_GLOBAL;
                                }
                                set_pmd(pmd, __pmd(__pe));
                                continue;
                        }

                        pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
                        set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));

                        if (pte != pte_offset(pmd, 0))
                                BUG();

                        for (k = 0; (k < PTRS_PER_PTE) && (vaddr <= end); pte++, k++) {
                                vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE;
                                *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL);
                        }
                }
        }
        /*
         * Fixed mappings, only the page table structure has to be
         * created - mappings will be set by set_fixmap():
         */
        vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
        fixrange_init(vaddr, 0, pgd_base);

#if CONFIG_HIGHMEM
        /*
         * Permanent kmaps:
         */
        vaddr = PKMAP_BASE;
        fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);

        pgd = swapper_pg_dir + __pgd_offset(vaddr);
        pmd = pmd_offset(pgd, vaddr);
        pte = pte_offset(pmd, vaddr);
        pkmap_page_table = pte;
#endif

#if CONFIG_X86_PAE
        /*
         * Add low memory identity-mappings - SMP needs it when
         * starting up on an AP from real-mode. In the non-PAE
         * case we already have these mappings through head.S.
         * All user-space mappings are explicitly cleared after
         * SMP startup.
         */
        pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
#endif
}
void __init zap_low_mappings (void)
{
        int i;
        /*
         * Zap initial low-memory mappings.
         *
         * Note that "pgd_clear()" doesn't do it for
         * us in this case, because pgd_clear() is a
         * no-op in the 2-level case (pmd_clear() is
         * the thing that clears the page-tables in
         * that case).
         */
        for (i = 0; i < USER_PTRS_PER_PGD; i++)
#if CONFIG_X86_PAE
                pgd_clear(swapper_pg_dir+i);
#else
                set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
        flush_tlb_all();
}
/*
 * paging_init() sets up the page tables - note that the first 4MB are
 * already mapped by head.S.
 *
 * This routine also unmaps the page at virtual kernel address 0, so
 * that we can trap those pesky NULL-reference errors in the kernel.
 */
void __init paging_init(void)
{
        pagetable_init();

        __asm__( "movl %%ecx,%%cr3\n" ::"c"(__pa(swapper_pg_dir)));

#if CONFIG_X86_PAE
        /*
         * We will bail out later - printk doesn't work right now so
         * the user would just see a hanging kernel.
         */
        if (cpu_has_pae)
                set_in_cr4(X86_CR4_PAE);
#endif

        __flush_tlb();

#ifdef __SMP__
        init_smp_mappings();
#endif

#ifdef CONFIG_HIGHMEM
        kmap_init();
#endif
        {
                unsigned int zones_size[MAX_NR_ZONES] = {0, 0, 0};
                unsigned int max_dma, high, low;

                max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
                low = max_low_pfn;
                high = highend_pfn;

                if (low < max_dma)
                        zones_size[ZONE_DMA] = low;
                else {
                        zones_size[ZONE_DMA] = max_dma;
                        zones_size[ZONE_NORMAL] = low - max_dma;
#ifdef CONFIG_HIGHMEM
                        zones_size[ZONE_HIGHMEM] = high - low;
#endif
                }
                free_area_init(zones_size);
        }
        return;
}
/*
 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
 * and also on some strange 486's (NexGen etc.). All 586+'s are OK. The jumps
 * before and after the test are here to work-around some nasty CPU bugs.
 */
void __init test_wp_bit(void)
{
        /*
         * Ok, all PAE-capable CPUs are definitely handling the WP bit right.
         */
        const unsigned long vaddr = PAGE_OFFSET;
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte, old_pte;
        char tmp_reg;

        printk("Checking if this processor honours the WP bit even in supervisor mode... ");

        pgd = swapper_pg_dir + __pgd_offset(vaddr);
        pmd = pmd_offset(pgd, vaddr);
        pte = pte_offset(pmd, vaddr);
        old_pte = *pte;
        *pte = mk_pte_phys(0, PAGE_READONLY);
        local_flush_tlb();

        __asm__ __volatile__(
                "jmp 1f; 1:\n"
                "movb %0,%1\n"
                "movb %1,%0\n"
                "jmp 1f; 1:\n"
                :"=m" (*(char *) vaddr),
                 "=q" (tmp_reg)
                :/* no inputs */
                :"memory");

        *pte = old_pte;
        local_flush_tlb();

        if (boot_cpu_data.wp_works_ok < 0) {
                boot_cpu_data.wp_works_ok = 0;
                printk("No.\n");
#ifdef CONFIG_X86_WP_WORKS_OK
                panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
#endif
        } else
                printk(".\n");
}
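
/*
 * Check the e820 map to see whether a page frame lies within a region
 * the BIOS reported as usable RAM.
 */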
static inline int page_is_ram (unsigned long pagenr)
{
        int i;

        for (i = 0; i < e820.nr_map; i++) {
                unsigned long addr, end;

                if (e820.map[i].type != E820_RAM)       /* not usable memory */
                        continue;
                /*
                 * !!!FIXME!!! Some BIOSen report areas as RAM that
                 * are not. Notably the 640->1Mb area. We need a sanity
                 * check here.
                 */
                addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
                end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
                if ((pagenr >= addr) && (pagenr < end))
                        return 1;
        }
        return 0;
}
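
/*
 * Late memory initialization: hand the bootmem pages over to the page
 * allocator, account reserved and highmem pages, print the memory
 * banner and, if still undecided, test the WP bit.
 */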
void __init mem_init(void)
{
        int codesize, reservedpages, datasize, initsize;
        int tmp;

        if (!mem_map)
                BUG();

#ifdef CONFIG_HIGHMEM
        highmem_start_page = mem_map + highstart_pfn;
        /* cache the highmem_mapnr */
        highmem_mapnr = highstart_pfn;
        max_mapnr = num_physpages = highend_pfn;
#else
        max_mapnr = num_physpages = max_low_pfn;
#endif
        high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        /* this will put all low memory onto the freelists */
        totalram_pages += free_all_bootmem();

        reservedpages = 0;
        for (tmp = 0; tmp < max_low_pfn; tmp++)
                /*
                 * Only count reserved RAM pages
                 */
                if (page_is_ram(tmp) && PageReserved(mem_map+tmp))
                        reservedpages++;
#ifdef CONFIG_HIGHMEM
        for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) {
                struct page *page = mem_map + tmp;

                if (!page_is_ram(tmp)) {
                        SetPageReserved(page);
                        continue;
                }
                ClearPageReserved(page);
                set_bit(PG_highmem, &page->flags);
                atomic_set(&page->count, 1);
                __free_page(page);
                totalhigh_pages++;
        }
        totalram_pages += totalhigh_pages;
#endif
        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                max_mapnr << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10,
                (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)));

#if CONFIG_X86_PAE
        if (!cpu_has_pae)
                panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
#endif
        if (boot_cpu_data.wp_works_ok < 0)
                test_wp_bit();

        /*
         * Subtle. SMP is doing its boot stuff late (because it has to
         * fork idle threads) - but it also needs low mappings for the
         * protected-mode entry to work. We zap these entries only after
         * the WP-bit has been tested.
         */
#ifndef CONFIG_SMP
        zap_low_mappings();
#endif
}
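
/*
 * Free the pages holding __init code and data once initialization has
 * finished.
 */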
void free_initmem(void)
{
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(mem_map + MAP_NR(addr));
                set_page_count(mem_map+MAP_NR(addr), 1);
                free_page(addr);
                totalram_pages++;
        }
        printk ("Freeing unused kernel memory: %dk freed\n", (&__init_end - &__init_begin) >> 10);
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(mem_map + MAP_NR(start));
                set_page_count(mem_map+MAP_NR(start), 1);
                free_page(start);
                totalram_pages++;
        }
        printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
}
#endif
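
/* Fill in the memory-related fields of a sysinfo structure. */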
void si_meminfo(struct sysinfo *val)
{
        val->totalram = totalram_pages;
        val->sharedram = 0;
        val->freeram = nr_free_pages();
        val->bufferram = atomic_read(&buffermem_pages);
        val->totalhigh = totalhigh_pages;
        val->freehigh = nr_free_highpages();
        val->mem_unit = PAGE_SIZE;
        return;
}