[davej-history.git] mm/memory.c (Import 2.1.122pre3)
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 *
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *              Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 *
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/string.h>
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;
/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
        if (from == ZERO_PAGE) {
                clear_page(to);
                return;
        }
        copy_page(to, from);
}
mem_map_t * mem_map = NULL;
/*
 * oom() prints a message (so that the user knows why the process died),
 * and gives the process an untrappable SIGKILL.
 */
void oom(struct task_struct * task)
{
        printk("\nOut of memory for %s.\n", task->comm);
        force_sig(SIGKILL, task);
}
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
        pte_t * pte;

        if (pmd_none(*dir))
                return;
        if (pmd_bad(*dir)) {
                printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
                pmd_clear(dir);
                return;
        }
        pte = pte_offset(dir, 0);
        pmd_clear(dir);
        pte_free(pte);
}
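/*
 * Tear down everything under one pgd entry: free each pte table via
 * free_one_pmd(), then free the pmd table itself. As with free_one_pmd(),
 * the pages these tables mapped have already been freed.
 */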
static inline void free_one_pgd(pgd_t * dir)
{
        int j;
        pmd_t * pmd;

        if (pgd_none(*dir))
                return;
        if (pgd_bad(*dir)) {
                printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
                pgd_clear(dir);
                return;
        }
        pmd = pmd_offset(dir, 0);
        pgd_clear(dir);
        for (j = 0; j < PTRS_PER_PMD ; j++)
                free_one_pmd(pmd+j);
        pmd_free(pmd);
}
/*
 * Low and high watermarks for the page table cache.
 * The system should try to have pgt_cache_water[0] <= cache elements <= pgt_cache_water[1]
 */
int pgt_cache_water[2] = { 25, 50 };

/* Returns the number of pages freed */
int check_pgt_cache(void)
{
        return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}
/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct task_struct * tsk)
{
        pgd_t * page_dir = tsk->mm->pgd;
        int i;

        if (!page_dir || page_dir == swapper_pg_dir)
                goto out_bad;
        for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
                free_one_pgd(page_dir + i);

        /* keep the page table cache within bounds */
        check_pgt_cache();
        return;

out_bad:
        printk(KERN_ERR
               "clear_page_tables: %s trying to clear kernel pgd\n",
               tsk->comm);
        return;
}
/*
 * This function frees up all page tables of a process when it exits. It
 * is the same as "clear_page_tables()", except it also frees the old
 * page table directory.
 */
void free_page_tables(struct mm_struct * mm)
{
        pgd_t * page_dir = mm->pgd;
        int i;

        if (!page_dir)
                goto out;
        if (page_dir == swapper_pg_dir)
                goto out_bad;
        for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
                free_one_pgd(page_dir + i);
        pgd_free(page_dir);

        /* keep the page table cache within bounds */
        check_pgt_cache();
out:
        return;

out_bad:
        printk(KERN_ERR
               "free_page_tables: Trying to free kernel pgd\n");
        return;
}
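/*
 * Allocate a fresh page directory for a task and install it, both in
 * tsk->mm->pgd and in the hardware context via SET_PAGE_DIR().
 */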
int new_page_tables(struct task_struct * tsk)
{
        pgd_t * new_pg;

        if (!(new_pg = pgd_alloc()))
                return -ENOMEM;
        SET_PAGE_DIR(tsk, new_pg);
        tsk->mm->pgd = new_pg;
        return 0;
}
#define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK  ((PTRS_PER_PMD-1) * sizeof(pmd_t))
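/*
 * The masks above cover the byte offset of an entry within its
 * (page-aligned) table, so "pointer & mask" becomes zero exactly when an
 * incremented table pointer has wrapped to the start of the next table.
 * copy_page_range() below uses this as its inner loop condition.
 */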
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma)
{
        pgd_t * src_pgd, * dst_pgd;
        unsigned long address = vma->vm_start;
        unsigned long end = vma->vm_end;
        unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;

        src_pgd = pgd_offset(src, address)-1;
        dst_pgd = pgd_offset(dst, address)-1;

        for (;;) {
                pmd_t * src_pmd, * dst_pmd;

                src_pgd++; dst_pgd++;

                /* copy_pmd_range */

                if (pgd_none(*src_pgd))
                        goto skip_copy_pmd_range;
                if (pgd_bad(*src_pgd)) {
                        printk("copy_pmd_range: bad pgd (%08lx)\n",
                                pgd_val(*src_pgd));
                        pgd_clear(src_pgd);
skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
                        if (address >= end)
                                goto out;
                        continue;
                }
                if (pgd_none(*dst_pgd)) {
                        if (!pmd_alloc(dst_pgd, 0))
                                goto nomem;
                }

                src_pmd = pmd_offset(src_pgd, address);
                dst_pmd = pmd_offset(dst_pgd, address);

                do {
                        pte_t * src_pte, * dst_pte;

                        /* copy_pte_range */

                        if (pmd_none(*src_pmd))
                                goto skip_copy_pte_range;
                        if (pmd_bad(*src_pmd)) {
                                printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
                                pmd_clear(src_pmd);
skip_copy_pte_range:            address = (address + PMD_SIZE) & PMD_MASK;
                                if (address >= end)
                                        goto out;
                                goto cont_copy_pmd_range;
                        }
                        if (pmd_none(*dst_pmd)) {
                                if (!pte_alloc(dst_pmd, 0))
                                        goto nomem;
                        }

                        src_pte = pte_offset(src_pmd, address);
                        dst_pte = pte_offset(dst_pmd, address);

                        do {
                                pte_t pte = *src_pte;
                                unsigned long page_nr;

                                /* copy_one_pte */

                                if (pte_none(pte))
                                        goto cont_copy_pte_range;
                                if (!pte_present(pte)) {
                                        swap_duplicate(pte_val(pte));
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                page_nr = MAP_NR(pte_page(pte));
                                if (page_nr >= max_mapnr ||
                                    PageReserved(mem_map+page_nr)) {
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                if (cow)
                                        pte = pte_wrprotect(pte);
#if 0 /* No longer needed with the new swap cache code */
                                if (delete_from_swap_cache(&mem_map[page_nr]))
                                        pte = pte_mkdirty(pte);
#endif
                                set_pte(dst_pte, pte_mkold(pte));
                                set_pte(src_pte, pte);
                                atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:            address += PAGE_SIZE;
                                if (address >= end)
                                        goto out;
                                src_pte++;
                                dst_pte++;
                        } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:    src_pmd++;
                        dst_pmd++;
                } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
        }
out:
        return 0;

nomem:
        return -ENOMEM;
}
/*
 * Return indicates whether a page was freed so caller can adjust rss
 */
static inline int free_pte(pte_t page)
{
        if (pte_present(page)) {
                unsigned long addr = pte_page(page);
                if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
                        return 0;
                /*
                 * free_page() used to be able to clear swap cache
                 * entries. We may now have to do it manually.
                 */
                free_page_and_swap_cache(addr);
                return 1;
        }
        swap_free(pte_val(page));
        return 0;
}
static inline void forget_pte(pte_t page)
{
        if (!pte_none(page)) {
                printk("forget_pte: old mapping existed!\n");
                free_pte(page);
        }
}
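/*
 * Clear the ptes covered by [address, address+size) within one pmd and
 * free the pages they referenced. Returns the number of pages actually
 * freed so the caller can adjust rss.
 */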
static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
        pte_t * pte;
        int freed;

        if (pmd_none(*pmd))
                return 0;
        if (pmd_bad(*pmd)) {
                printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
                pmd_clear(pmd);
                return 0;
        }
        pte = pte_offset(pmd, address);
        address &= ~PMD_MASK;
        if (address + size > PMD_SIZE)
                size = PMD_SIZE - address;
        size >>= PAGE_SHIFT;
        freed = 0;
        for (;;) {
                pte_t page;
                if (!size)
                        break;
                page = *pte;
                pte++;
                size--;
                if (pte_none(page))
                        continue;
                pte_clear(pte-1);
                freed += free_pte(page);
        }
        return freed;
}
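/*
 * Walk the pmd entries under one pgd entry, zapping each pte range in
 * turn, and return the total number of pages freed.
 */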
static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
{
        pmd_t * pmd;
        unsigned long end;
        int freed;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }
        pmd = pmd_offset(dir, address);
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        freed = 0;
        do {
                freed += zap_pte_range(pmd, address, end - address);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return freed;
}
/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
        pgd_t * dir;
        unsigned long end = address + size;
        int freed = 0;

        dir = pgd_offset(mm, address);
        while (address < end) {
                freed += zap_pmd_range(dir, address, end - address);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        /*
         * Update rss for the mm_struct (not necessarily current->mm)
         */
        if (mm->rss > 0) {
                mm->rss -= freed;
                if (mm->rss < 0)
                        mm->rss = 0;
        }
}
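/*
 * The zeromap_*() helpers below point every pte in the given range at
 * the global zero page, write-protected, dropping (and complaining
 * about) whatever mapping was there before.
 */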
static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                pte_t oldpage = *pte;
                set_pte(pte, zero_pte);
                forget_pte(oldpage);
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        do {
                pte_t * pte = pte_alloc(pmd, address);
                if (!pte)
                        return -ENOMEM;
                zeromap_pte_range(pte, address, end - address, zero_pte);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
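/*
 * Map [address, address+size) in the current process to the zero page,
 * read-only, allocating any missing pmd and pte tables along the way.
 */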
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
        int error = 0;
        pgd_t * dir;
        unsigned long beg = address;
        unsigned long end = address + size;
        pte_t zero_pte;

        zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
        dir = pgd_offset(current->mm, address);
        flush_cache_range(current->mm, beg, end);
        while (address < end) {
                pmd_t *pmd = pmd_alloc(dir, address);
                error = -ENOMEM;
                if (!pmd)
                        break;
                error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
                if (error)
                        break;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(current->mm, beg, end);
        return error;
}
/*
 * Maps a range of physical memory into the requested pages. The old
 * mappings are removed. Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
        unsigned long phys_addr, pgprot_t prot)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                unsigned long mapnr;
                pte_t oldpage = *pte;
                pte_clear(pte);

                mapnr = MAP_NR(__va(phys_addr));
                if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
                        set_pte(pte, mk_pte_phys(phys_addr, prot));
                forget_pte(oldpage);
                address += PAGE_SIZE;
                phys_addr += PAGE_SIZE;
                pte++;
        } while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
        unsigned long phys_addr, pgprot_t prot)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        phys_addr -= address;
        do {
                pte_t * pte = pte_alloc(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_pte_range(pte, address, end - address, address + phys_addr, prot);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
        int error = 0;
        pgd_t * dir;
        unsigned long beg = from;
        unsigned long end = from + size;

        phys_addr -= from;
        dir = pgd_offset(current->mm, from);
        flush_cache_range(current->mm, beg, end);
        while (from < end) {
                pmd_t *pmd = pmd_alloc(dir, from);
                error = -ENOMEM;
                if (!pmd)
                        break;
                error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
                if (error)
                        break;
                from = (from + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(current->mm, beg, end);
        return error;
}
/*
 * sanity-check function..
 */
static void put_page(pte_t * page_table, pte_t pte)
{
        if (!pte_none(*page_table)) {
                free_page_and_swap_cache(pte_page(pte));
                return;
        }
        /* no need for flush_tlb */
        set_pte(page_table, pte);
}
/*
 * This routine is used to map in a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
        pgd_t * pgd;
        pmd_t * pmd;
        pte_t * pte;

        if (MAP_NR(page) >= max_mapnr)
                printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
        if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
                printk("mem_map disagrees with %08lx at %08lx\n",page,address);
        pgd = pgd_offset(tsk->mm,address);
        pmd = pmd_alloc(pgd, address);
        if (!pmd) {
                free_page(page);
                oom(tsk);
                return 0;
        }
        pte = pte_alloc(pmd, address);
        if (!pte) {
                free_page(page);
                oom(tsk);
                return 0;
        }
        if (!pte_none(*pte)) {
                printk("put_dirty_page: page already exists\n");
                free_page(page);
                return 0;
        }
        flush_page_to_ram(page);
        set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
        /* no need for flush_tlb */
        return page;
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here is that it results
 * in better assembly code.. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 */
static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, pte_t *page_table)
{
        pte_t pte;
        unsigned long old_page, new_page;
        struct page * page_map;

        pte = *page_table;
        new_page = __get_free_page(GFP_KERNEL);
        /* Did someone else copy this page for us while we slept? */
        if (pte_val(*page_table) != pte_val(pte))
                goto end_wp_page;
        if (!pte_present(pte))
                goto end_wp_page;
        if (pte_write(pte))
                goto end_wp_page;
        old_page = pte_page(pte);
        if (MAP_NR(old_page) >= max_mapnr)
                goto bad_wp_page;
        tsk->min_flt++;
        page_map = mem_map + MAP_NR(old_page);

        /*
         * Do we need to copy?
         */
        if (is_page_shared(page_map)) {
                if (new_page) {
                        if (PageReserved(mem_map + MAP_NR(old_page)))
                                ++vma->vm_mm->rss;
                        copy_cow_page(old_page,new_page);
                        flush_page_to_ram(old_page);
                        flush_page_to_ram(new_page);
                        flush_cache_page(vma, address);
                        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
                        free_page(old_page);
                        flush_tlb_page(vma, address);
                        return;
                }
                flush_cache_page(vma, address);
                set_pte(page_table, BAD_PAGE);
                flush_tlb_page(vma, address);
                free_page(old_page);
                oom(tsk);
                return;
        }
        if (PageSwapCache(page_map))
                delete_from_swap_cache(page_map);
        flush_cache_page(vma, address);
        set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
        flush_tlb_page(vma, address);
        if (new_page)
                free_page(new_page);
        return;

bad_wp_page:
        printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
        send_sig(SIGKILL, tsk, 1);
end_wp_page:
        if (new_page)
                free_page(new_page);
        return;
}
/*
 * This function zeroes out partial mmap'ed pages at truncation time..
 */
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
        pgd_t *page_dir;
        pmd_t *page_middle;
        pte_t *page_table, pte;

        page_dir = pgd_offset(vma->vm_mm, address);
        if (pgd_none(*page_dir))
                return;
        if (pgd_bad(*page_dir)) {
                printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
                pgd_clear(page_dir);
                return;
        }
        page_middle = pmd_offset(page_dir, address);
        if (pmd_none(*page_middle))
                return;
        if (pmd_bad(*page_middle)) {
                printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
                pmd_clear(page_middle);
                return;
        }
        page_table = pte_offset(page_middle, address);
        pte = *page_table;
        if (!pte_present(pte))
                return;
        flush_cache_page(vma, address);
        address &= ~PAGE_MASK;
        address += pte_page(pte);
        if (MAP_NR(address) >= max_mapnr)
                return;
        memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
        flush_page_to_ram(pte_page(pte));
}
/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page. Ugly, but necessary.
 */
void vmtruncate(struct inode * inode, unsigned long offset)
{
        struct vm_area_struct * mpnt;

        truncate_inode_pages(inode, offset);
        if (!inode->i_mmap)
                return;
        mpnt = inode->i_mmap;
        do {
                struct mm_struct *mm = mpnt->vm_mm;
                unsigned long start = mpnt->vm_start;
                unsigned long end = mpnt->vm_end;
                unsigned long len = end - start;
                unsigned long diff;

                /* mapping wholly truncated? */
                if (mpnt->vm_offset >= offset) {
                        flush_cache_range(mm, start, end);
                        zap_page_range(mm, start, len);
                        flush_tlb_range(mm, start, end);
                        continue;
                }
                /* mapping wholly unaffected? */
                diff = offset - mpnt->vm_offset;
                if (diff >= len)
                        continue;
                /* Ok, partially affected.. */
                start += diff;
                len = (len - diff) & PAGE_MASK;
                if (start & ~PAGE_MASK) {
                        partial_clear(mpnt, start);
                        start = (start + ~PAGE_MASK) & PAGE_MASK;
                }
                flush_cache_range(mm, start, end);
                zap_page_range(mm, start, len);
                flush_tlb_range(mm, start, end);
        } while ((mpnt = mpnt->vm_next_share) != NULL);
}
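/*
 * The pte is not present but not empty, so it refers to a swapped-out
 * page. Bring the page back in, either through the generic swap_in()
 * path or through the vma's own swapin() operation. In the latter case
 * the pte is re-checked after the (possibly sleeping) swapin so that we
 * don't clobber an entry somebody else already fixed up, and the page
 * is write-protected if it is still shared and the mapping isn't
 * VM_SHARED.
 */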
static inline void do_swap_page(struct task_struct * tsk,
        struct vm_area_struct * vma, unsigned long address,
        pte_t * page_table, pte_t entry, int write_access)
{
        pte_t page;

        if (!vma->vm_ops || !vma->vm_ops->swapin) {
                swap_in(tsk, vma, page_table, pte_val(entry), write_access);
                flush_page_to_ram(pte_page(*page_table));
                return;
        }
        page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
        if (pte_val(*page_table) != pte_val(entry)) {
                free_page(pte_page(page));
                return;
        }
        if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
            !(vma->vm_flags & VM_SHARED))
                page = pte_wrprotect(page);
        ++vma->vm_mm->rss;
        ++tsk->maj_flt;
        flush_page_to_ram(pte_page(page));
        set_pte(page_table, page);
        return;
}
/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, int write_access, pte_t *page_table, pte_t entry)
{
        unsigned long page;

        if (!pte_none(entry))
                goto swap_page;
        address &= PAGE_MASK;
        if (!vma->vm_ops || !vma->vm_ops->nopage)
                goto anonymous_page;

        /*
         * The third argument is "no_share", which tells the low-level code
         * to copy, not share the page even if sharing is possible. It's
         * essentially an early COW detection.
         */
        page = vma->vm_ops->nopage(vma, address,
                (vma->vm_flags & VM_SHARED)?0:write_access);
        if (!page)
                goto sigbus;
        ++tsk->maj_flt;
        ++vma->vm_mm->rss;
        /*
         * This silly early PAGE_DIRTY setting removes a race
         * due to the bad i386 page protection. But it's valid
         * for other architectures too.
         *
         * Note that if write_access is true, we either now have
         * an exclusive copy of the page, or this is a shared mapping,
         * so we can make it writable and dirty to avoid having to
         * handle that later.
         */
        flush_page_to_ram(page);
        entry = mk_pte(page, vma->vm_page_prot);
        if (write_access) {
                entry = pte_mkwrite(pte_mkdirty(entry));
        } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
                   !(vma->vm_flags & VM_SHARED))
                entry = pte_wrprotect(entry);
        put_page(page_table, entry);
        /* no need to invalidate: a not-present page shouldn't be cached */
        return;

anonymous_page:
        entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
        if (write_access) {
                unsigned long page = __get_free_page(GFP_KERNEL);
                if (!page)
                        goto sigbus;
                clear_page(page);
                entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
                vma->vm_mm->rss++;
                tsk->min_flt++;
                flush_page_to_ram(page);
        }
        put_page(page_table, entry);
        return;

sigbus:
        force_sig(SIGBUS, current);
        put_page(page_table, BAD_PAGE);
        /* no need to invalidate, wasn't present */
        return;

swap_page:
        do_swap_page(tsk, vma, address, page_table, entry, write_access);
        return;
}
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures). The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 */
static inline void handle_pte_fault(struct task_struct *tsk,
        struct vm_area_struct * vma, unsigned long address,
        int write_access, pte_t * pte)
{
        pte_t entry = *pte;

        if (!pte_present(entry)) {
                do_no_page(tsk, vma, address, write_access, pte, entry);
                return;
        }
        entry = pte_mkyoung(entry);
        set_pte(pte, entry);
        flush_tlb_page(vma, address);
        if (!write_access)
                return;
        if (pte_write(entry)) {
                entry = pte_mkdirty(entry);
                set_pte(pte, entry);
                flush_tlb_page(vma, address);
                return;
        }
        do_wp_page(tsk, vma, address, pte);
}
/*
 * By the time we get here, we already hold the mm semaphore
 */
void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
        unsigned long address, int write_access)
{
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;

        pgd = pgd_offset(vma->vm_mm, address);
        pmd = pmd_alloc(pgd, address);
        if (!pmd)
                goto no_memory;
        pte = pte_alloc(pmd, address);
        if (!pte)
                goto no_memory;
        lock_kernel();
        handle_pte_fault(tsk, vma, address, write_access, pte);
        unlock_kernel();
        update_mmu_cache(vma, address, *pte);
        return;

no_memory:
        oom(tsk);
}
/*
 * Simplistic page force-in..
 */
void make_pages_present(unsigned long addr, unsigned long end)
{
        int write;
        struct vm_area_struct * vma;

        vma = find_vma(current->mm, addr);
        write = (vma->vm_flags & VM_WRITE) != 0;
        while (addr < end) {
                handle_mm_fault(current, vma, addr, write);
                addr += PAGE_SIZE;
        }
}