/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 *
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 *
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *              Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 *
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 */
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;
/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
        if (from == ZERO_PAGE) {
                clear_page(to);
                return;
        }
        copy_page(to, from);
}
mem_map_t * mem_map = NULL;
/*
 * oom() prints a message (so that the user knows why the process died),
 * and gives the process an untrappable SIGKILL.
 */
void oom(struct task_struct * task)
{
        printk("\nOut of memory for %s.\n", task->comm);
        force_sig(SIGKILL, task);
}
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
        pte_t * pte;

        if (pmd_none(*dir))
                return;
        if (pmd_bad(*dir)) {
                printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
                pmd_clear(dir);
                return;
        }
        pte = pte_offset(dir, 0);
        pmd_clear(dir);
        pte_free(pte);
}
static inline void free_one_pgd(pgd_t * dir)
{
        int j;
        pmd_t * pmd;

        if (pgd_none(*dir))
                return;
        if (pgd_bad(*dir)) {
                printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
                pgd_clear(dir);
                return;
        }
        pmd = pmd_offset(dir, 0);
        pgd_clear(dir);
        for (j = 0; j < PTRS_PER_PMD ; j++)
                free_one_pmd(pmd+j);
        pmd_free(pmd);
}
/* Low and high watermarks for the page table cache.
   The system should try to have
   pgt_cache_water[0] <= cache elements <= pgt_cache_water[1]
 */
int pgt_cache_water[2] = { 25, 50 };
/* Returns the number of pages freed */
int check_pgt_cache(void)
{
        return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}
/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
        pgd_t * page_dir = mm->pgd;

        if (page_dir && page_dir != swapper_pg_dir) {
                page_dir += first;
                do {
                        free_one_pgd(page_dir);
                        page_dir++;
                } while (--nr);
        }

        /* keep the page table cache within bounds */
        check_pgt_cache();
}
/*
 * This function just frees the page directory - the
 * page tables themselves have been freed earlier by
 * clear_page_tables().
 */
void free_page_tables(struct mm_struct * mm)
{
        pgd_t * page_dir = mm->pgd;

        if (page_dir) {
                if (page_dir == swapper_pg_dir)
                        goto out_bad;
                pgd_free(page_dir);
        }
        return;

out_bad:
        printk(KERN_ERR
                "free_page_tables: Trying to free kernel pgd\n");
        return;
}
int new_page_tables(struct task_struct * tsk)
{
        pgd_t * new_pg;

        if (!(new_pg = pgd_alloc()))
                return -ENOMEM;
        SET_PAGE_DIR(tsk, new_pg);
        tsk->mm->pgd = new_pg;
        return 0;
}
#define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK  ((PTRS_PER_PMD-1) * sizeof(pmd_t))
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma)
{
        pgd_t * src_pgd, * dst_pgd;
        unsigned long address = vma->vm_start;
        unsigned long end = vma->vm_end;
        unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        src_pgd = pgd_offset(src, address)-1;
        dst_pgd = pgd_offset(dst, address)-1;

        for (;;) {
                pmd_t * src_pmd, * dst_pmd;

                src_pgd++; dst_pgd++;

                /* copy_pmd_range */

                if (pgd_none(*src_pgd))
                        goto skip_copy_pmd_range;
                if (pgd_bad(*src_pgd)) {
                        printk("copy_pmd_range: bad pgd (%08lx)\n",
                                pgd_val(*src_pgd));
                        pgd_clear(src_pgd);
skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
                        if (address >= end)
                                goto out;
                        continue;
                }
                if (pgd_none(*dst_pgd)) {
                        if (!pmd_alloc(dst_pgd, 0))
                                goto nomem;
                }

                src_pmd = pmd_offset(src_pgd, address);
                dst_pmd = pmd_offset(dst_pgd, address);

                do {
                        pte_t * src_pte, * dst_pte;

                        /* copy_pte_range */

                        if (pmd_none(*src_pmd))
                                goto skip_copy_pte_range;
                        if (pmd_bad(*src_pmd)) {
                                printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
                                pmd_clear(src_pmd);
skip_copy_pte_range:            address = (address + PMD_SIZE) & PMD_MASK;
                                if (address >= end)
                                        goto out;
                                goto cont_copy_pmd_range;
                        }
                        if (pmd_none(*dst_pmd)) {
                                if (!pte_alloc(dst_pmd, 0))
                                        goto nomem;
                        }

                        src_pte = pte_offset(src_pmd, address);
                        dst_pte = pte_offset(dst_pmd, address);

                        do {
                                pte_t pte = *src_pte;
                                unsigned long page_nr;

                                /* copy_one_pte */

                                if (pte_none(pte))
                                        goto cont_copy_pte_range;
                                if (!pte_present(pte)) {
                                        swap_duplicate(pte_val(pte));
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                page_nr = MAP_NR(pte_page(pte));
                                if (page_nr >= max_mapnr ||
                                    PageReserved(mem_map+page_nr)) {
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                /* If it's a COW mapping, write protect it both in the parent and the child */
                                if (cow) {
                                        pte = pte_wrprotect(pte);
                                        set_pte(src_pte, pte);
                                }
                                /* If it's a shared mapping, mark it clean in the child */
                                if (vma->vm_flags & VM_SHARED)
                                        pte = pte_mkclean(pte);
                                set_pte(dst_pte, pte_mkold(pte));
                                atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:            address += PAGE_SIZE;
                                if (address >= end)
                                        goto out;
                                src_pte++;
                                dst_pte++;
                        } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:    src_pmd++;
                        dst_pmd++;
                } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
        }
out:
        return 0;

nomem:
        return -ENOMEM;
}
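/*
 * Summary of the COW rule above: a mapping is treated as copy-on-write
 * exactly when it is private but potentially writable, i.e. VM_MAYWRITE
 * is set and VM_SHARED is clear (so masking with VM_SHARED|VM_MAYWRITE
 * yields VM_MAYWRITE).  For such a vma both the parent and the child
 * pte are write-protected here, so the first write on either side
 * faults into do_wp_page(), which then makes the private copy.  Shared
 * mappings keep a single writable page; the child's pte is only marked
 * clean and old.
 */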
/*
 * Return indicates whether a page was freed so caller can adjust rss
 */
static inline int free_pte(pte_t page)
{
        if (pte_present(page)) {
                unsigned long addr = pte_page(page);
                if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
                        return 0;
                /*
                 * free_page() used to be able to clear swap cache
                 * entries.  We may now have to do it manually.
                 */
                free_page_and_swap_cache(addr);
                return 1;
        }
        swap_free(pte_val(page));
        return 0;
}
static inline void forget_pte(pte_t page)
{
        if (!pte_none(page)) {
                printk("forget_pte: old mapping existed!\n");
                free_pte(page);
        }
}
static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
        pte_t * pte;
        int freed;

        if (pmd_none(*pmd))
                return 0;
        if (pmd_bad(*pmd)) {
                printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
                pmd_clear(pmd);
                return 0;
        }
        pte = pte_offset(pmd, address);
        address &= ~PMD_MASK;
        if (address + size > PMD_SIZE)
                size = PMD_SIZE - address;
        size >>= PAGE_SHIFT;
        freed = 0;
        for (;;) {
                pte_t page;
                if (!size)
                        break;
                page = *pte;
                pte++;
                size--;
                if (pte_none(page))
                        continue;
                pte_clear(pte-1);
                freed += free_pte(page);
        }
        return freed;
}
static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
{
        pmd_t * pmd;
        unsigned long end;
        int freed;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }
        pmd = pmd_offset(dir, address);
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        freed = 0;
        do {
                freed += zap_pte_range(pmd, address, end - address);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return freed;
}
/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
        pgd_t * dir;
        unsigned long end = address + size;
        int freed = 0;

        dir = pgd_offset(mm, address);
        while (address < end) {
                freed += zap_pmd_range(dir, address, end - address);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        /*
         * Update rss for the mm_struct (not necessarily current->mm)
         */
        if (mm->rss > 0) {
                mm->rss -= freed;
                if (mm->rss < 0)
                        mm->rss = 0;
        }
}
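/*
 * Note on the "freed" accounting above: free_pte() only reports pages
 * that were actually released, so reserved pages and pages outside
 * mem_map never disturb the rss count, and the count is clamped at
 * zero in case the bookkeeping has drifted.  Callers are expected to
 * bracket this with cache/TLB flushes themselves; vmtruncate() below
 * shows the typical flush_cache_range() / zap_page_range() /
 * flush_tlb_range() sequence.
 */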
static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                pte_t oldpage = *pte;
                set_pte(pte, zero_pte);
                forget_pte(oldpage);
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        do {
                pte_t * pte = pte_alloc(pmd, address);
                if (!pte)
                        return -ENOMEM;
                zeromap_pte_range(pte, address, end - address, zero_pte);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
        int error = 0;
        pgd_t * dir;
        unsigned long beg = address;
        unsigned long end = address + size;
        pte_t zero_pte;

        zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
        dir = pgd_offset(current->mm, address);
        flush_cache_range(current->mm, beg, end);
        while (address < end) {
                pmd_t *pmd = pmd_alloc(dir, address);
                error = -ENOMEM;
                if (!pmd)
                        break;
                error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
                if (error)
                        break;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(current->mm, beg, end);
        return error;
}
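/*
 * Illustrative sketch (not part of this file): a 2.2-era driver that
 * wants to hand out pre-zeroed, read-mostly memory could back its
 * mmap() method with zeromap_page_range(); every pte in the range then
 * points at ZERO_PAGE, write-protected.  The "exdev" name below is
 * made up.
 *
 *      static int exdev_mmap(struct file *file, struct vm_area_struct *vma)
 *      {
 *              if (zeromap_page_range(vma->vm_start,
 *                                     vma->vm_end - vma->vm_start,
 *                                     vma->vm_page_prot))
 *                      return -EAGAIN;
 *              return 0;
 *      }
 */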
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access")
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
        unsigned long phys_addr, pgprot_t prot)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                unsigned long mapnr;
                pte_t oldpage = *pte;
                pte_clear(pte);

                mapnr = MAP_NR(__va(phys_addr));
                if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
                        set_pte(pte, mk_pte_phys(phys_addr, prot));
                forget_pte(oldpage);
                address += PAGE_SIZE;
                phys_addr += PAGE_SIZE;
                pte++;
        } while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
        unsigned long phys_addr, pgprot_t prot)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        phys_addr -= address;
        do {
                pte_t * pte = pte_alloc(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_pte_range(pte, address, end - address, address + phys_addr, prot);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
        int error = 0;
        pgd_t * dir;
        unsigned long beg = from;
        unsigned long end = from + size;

        phys_addr -= from;
        dir = pgd_offset(current->mm, from);
        flush_cache_range(current->mm, beg, end);
        while (from < end) {
                pmd_t *pmd = pmd_alloc(dir, from);
                error = -ENOMEM;
                if (!pmd)
                        break;
                error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
                if (error)
                        break;
                from = (from + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(current->mm, beg, end);
        return error;
}
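/*
 * Illustrative sketch (not part of this file): the usual consumer of
 * remap_page_range() in this era is a device driver's mmap() method,
 * mapping a physical region into the caller's address space.  The
 * "exdev" names and EXDEV_PHYS_BASE below are made up.
 *
 *      static int exdev_mmap(struct file *file, struct vm_area_struct *vma)
 *      {
 *              unsigned long size = vma->vm_end - vma->vm_start;
 *
 *              if (remap_page_range(vma->vm_start, EXDEV_PHYS_BASE,
 *                                   size, vma->vm_page_prot))
 *                      return -EAGAIN;
 *              return 0;
 *      }
 */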
/*
 * sanity-check function..
 */
static void put_page(pte_t * page_table, pte_t pte)
{
        if (!pte_none(*page_table)) {
                free_page_and_swap_cache(pte_page(pte));
                return;
        }
        /* no need for flush_tlb */
        set_pte(page_table, pte);
}
/*
 * This routine is used to map in a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
        pgd_t * pgd;
        pmd_t * pmd;
        pte_t * pte;

        if (MAP_NR(page) >= max_mapnr)
                printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
        if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
                printk("mem_map disagrees with %08lx at %08lx\n",page,address);
        pgd = pgd_offset(tsk->mm,address);
        pmd = pmd_alloc(pgd, address);
        if (!pmd) {
                free_page(page);
                oom(tsk);
                return 0;
        }
        pte = pte_alloc(pmd, address);
        if (!pte) {
                free_page(page);
                oom(tsk);
                return 0;
        }
        if (!pte_none(*pte)) {
                printk("put_dirty_page: pte %08lx already exists\n",
                       pte_val(*pte));
                free_page(page);
                return 0;
        }
        flush_page_to_ram(page);
        set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
        /* no need for flush_tlb */
        return page;
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here is that it results
 * in better assembly code.. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 */
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, pte_t *page_table)
{
        pte_t pte;
        unsigned long old_page, new_page;
        struct page * page_map;

        pte = *page_table;
        new_page = __get_free_page(GFP_USER);
        /* Did someone else copy this page for us while we slept? */
        if (pte_val(*page_table) != pte_val(pte))
                goto end_wp_page;
        if (!pte_present(pte))
                goto end_wp_page;
        if (pte_write(pte))
                goto end_wp_page;
        old_page = pte_page(pte);
        if (MAP_NR(old_page) >= max_mapnr)
                goto bad_wp_page;
        tsk->min_flt++;
        page_map = mem_map + MAP_NR(old_page);

        /*
         * We can avoid the copy if:
         * - we're the only user (count == 1)
         * - the only other user is the swap cache,
         *   and the only swap cache user is itself,
         *   in which case we can remove the page
         *   from the swap cache.
         */
        switch (atomic_read(&page_map->count)) {
        case 2:
                if (!PageSwapCache(page_map))
                        break;
                if (swap_count(page_map->offset) != 1)
                        break;
                delete_from_swap_cache(page_map);
                /* FallThrough */
        case 1:
                /* We can release the kernel lock now.. */
                unlock_kernel();

                flush_cache_page(vma, address);
                set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
                flush_tlb_page(vma, address);
end_wp_page:
                if (new_page)
                        free_page(new_page);
                return 1;
        }

        unlock_kernel();
        if (!new_page)
                return 0;

        if (PageReserved(mem_map + MAP_NR(old_page)))
                ++vma->vm_mm->rss;
        copy_cow_page(old_page,new_page);
        flush_page_to_ram(old_page);
        flush_page_to_ram(new_page);
        flush_cache_page(vma, address);
        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
        free_page(old_page);
        flush_tlb_page(vma, address);
        return 1;

bad_wp_page:
        printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
        send_sig(SIGKILL, tsk, 1);
        if (new_page)
                free_page(new_page);
        return 0;
}
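/*
 * The two possible outcomes of do_wp_page(), as implemented above: if
 * the page has a single user (or its only other user was a swap cache
 * entry we could drop), the existing page is simply made writable and
 * dirty in place; otherwise copy_cow_page() copies it into the freshly
 * allocated new_page, the pte is pointed at the copy, and the old
 * page's reference is released with free_page().
 */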
/*
 * This function zeroes out partial mmap'ed pages at truncation time..
 */
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
        pgd_t *page_dir;
        pmd_t *page_middle;
        pte_t *page_table, pte;

        page_dir = pgd_offset(vma->vm_mm, address);
        if (pgd_none(*page_dir))
                return;
        if (pgd_bad(*page_dir)) {
                printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
                pgd_clear(page_dir);
                return;
        }
        page_middle = pmd_offset(page_dir, address);
        if (pmd_none(*page_middle))
                return;
        if (pmd_bad(*page_middle)) {
                printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
                pmd_clear(page_middle);
                return;
        }
        page_table = pte_offset(page_middle, address);
        pte = *page_table;
        if (!pte_present(pte))
                return;
        flush_cache_page(vma, address);
        address &= ~PAGE_MASK;
        address += pte_page(pte);
        if (MAP_NR(address) >= max_mapnr)
                return;
        memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
        flush_page_to_ram(pte_page(pte));
}
/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page.  Ugly, but necessary.
 */
void vmtruncate(struct inode * inode, unsigned long offset)
{
        struct vm_area_struct * mpnt;

        truncate_inode_pages(inode, offset);
        if (!inode->i_mmap)
                return;
        mpnt = inode->i_mmap;
        do {
                struct mm_struct *mm = mpnt->vm_mm;
                unsigned long start = mpnt->vm_start;
                unsigned long end = mpnt->vm_end;
                unsigned long len = end - start;
                unsigned long diff;

                /* mapping wholly truncated? */
                if (mpnt->vm_offset >= offset) {
                        flush_cache_range(mm, start, end);
                        zap_page_range(mm, start, len);
                        flush_tlb_range(mm, start, end);
                        continue;
                }
                /* mapping wholly unaffected? */
                diff = offset - mpnt->vm_offset;
                if (diff >= len)
                        continue;
                /* Ok, partially affected.. */
                start += diff;
                len = (len - diff) & PAGE_MASK;
                if (start & ~PAGE_MASK) {
                        partial_clear(mpnt, start);
                        start = (start + ~PAGE_MASK) & PAGE_MASK;
                }
                flush_cache_range(mm, start, end);
                zap_page_range(mm, start, len);
                flush_tlb_range(mm, start, end);
        } while ((mpnt = mpnt->vm_next_share) != NULL);
}
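/*
 * Worked example of the partial-page case above (assuming a 4096-byte
 * PAGE_SIZE): truncating a file that is mapped with vm_offset 0 down
 * to offset 0x1800 leaves "start" at vm_start + 0x1800, which is not
 * page aligned, so partial_clear() zeroes bytes 0x800..0xfff of that
 * page in place, "start" is rounded up to the next page boundary, and
 * zap_page_range() then drops the remaining whole pages of the mapping.
 */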
/*
 * This is called with the kernel lock held, we need
 * to return without it.
 */
static int do_swap_page(struct task_struct * tsk,
        struct vm_area_struct * vma, unsigned long address,
        pte_t * page_table, pte_t entry, int write_access)
{
        if (!vma->vm_ops || !vma->vm_ops->swapin) {
                swap_in(tsk, vma, page_table, pte_val(entry), write_access);
                flush_page_to_ram(pte_page(*page_table));
        } else {
                pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
                if (pte_val(*page_table) != pte_val(entry)) {
                        free_page(pte_page(page));
                } else {
                        if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
                            !(vma->vm_flags & VM_SHARED))
                                page = pte_wrprotect(page);
                        ++vma->vm_mm->rss;
                        ++tsk->maj_flt;
                        flush_page_to_ram(pte_page(page));
                        set_pte(page_table, page);
                }
        }
        unlock_kernel();
        return 1;
}
/*
 * This only needs the MM semaphore
 */
static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access)
{
        pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
        if (write_access) {
                unsigned long page = __get_free_page(GFP_USER);
                if (!page)
                        return 0;
                clear_page(page);
                entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
                vma->vm_mm->rss++;
                tsk->min_flt++;
                flush_page_to_ram(page);
        }
        put_page(page_table, entry);
        return 1;
}
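/*
 * Note on the anonymous case above: a read fault costs no memory at
 * all - the pte is simply pointed at the global ZERO_PAGE, write
 * protected.  If the task later writes to it, the fault is handled by
 * do_wp_page(), whose copy_cow_page() recognizes ZERO_PAGE and clears
 * the new page instead of copying it.
 */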
/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * This is called with the MM semaphore and the kernel lock held.
 * We need to release the kernel lock as soon as possible..
 */
static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, int write_access, pte_t *page_table)
{
        unsigned long page;
        pte_t entry;

        if (!vma->vm_ops || !vma->vm_ops->nopage) {
                unlock_kernel();
                return do_anonymous_page(tsk, vma, page_table, write_access);
        }

        /*
         * The third argument is "no_share", which tells the low-level code
         * to copy, not share the page even if sharing is possible.  It's
         * essentially an early COW detection.
         */
        page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
                (vma->vm_flags & VM_SHARED)?0:write_access);

        unlock_kernel();
        if (!page)
                return 0;

        ++tsk->maj_flt;
        ++vma->vm_mm->rss;
        /*
         * This silly early PAGE_DIRTY setting removes a race
         * due to the bad i386 page protection. But it's valid
         * for other architectures too.
         *
         * Note that if write_access is true, we either now have
         * an exclusive copy of the page, or this is a shared mapping,
         * so we can make it writable and dirty to avoid having to
         * handle that later.
         */
        flush_page_to_ram(page);
        entry = mk_pte(page, vma->vm_page_prot);
        if (write_access) {
                entry = pte_mkwrite(pte_mkdirty(entry));
        } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
                   !(vma->vm_flags & VM_SHARED))
                entry = pte_wrprotect(entry);
        put_page(page_table, entry);
        /* no need to invalidate: a not-present page shouldn't be cached */
        return 1;
}
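/*
 * Illustrative sketch (not part of this file) of the ->nopage()
 * contract used above: the method returns the kernel address of a page
 * already holding a reference for the new mapping, or 0 on failure;
 * when "no_share" is non-zero it must hand back a private copy rather
 * than a shared page.  The "exdev" names below are made up, and
 * exdev_fill_page() is a hypothetical helper.
 *
 *      static unsigned long exdev_nopage(struct vm_area_struct *vma,
 *                                        unsigned long address, int no_share)
 *      {
 *              unsigned long page = __get_free_page(GFP_USER);
 *
 *              if (!page)
 *                      return 0;
 *              exdev_fill_page(vma, address, page);
 *              return page;
 *      }
 */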
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 */
static inline int handle_pte_fault(struct task_struct *tsk,
        struct vm_area_struct * vma, unsigned long address,
        int write_access, pte_t * pte)
{
        pte_t entry;

        lock_kernel();
        entry = *pte;

        if (!pte_present(entry)) {
                if (pte_none(entry))
                        return do_no_page(tsk, vma, address, write_access, pte);
                return do_swap_page(tsk, vma, address, pte, entry, write_access);
        }

        entry = pte_mkyoung(entry);
        set_pte(pte, entry);
        flush_tlb_page(vma, address);
        if (write_access) {
                if (!pte_write(entry))
                        return do_wp_page(tsk, vma, address, pte);

                entry = pte_mkdirty(entry);
                set_pte(pte, entry);
                flush_tlb_page(vma, address);
        }
        unlock_kernel();
        return 1;
}
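/*
 * Dispatch summary for handle_pte_fault(): a pte that is not present
 * and empty goes to do_no_page() (anonymous memory or a ->nopage()
 * mapping); not present but non-empty means a swap entry, handled by
 * do_swap_page(); a present pte reaches here either to emulate the
 * accessed/dirty bits in software or, for a write to a write-protected
 * pte, to break COW via do_wp_page().  The helpers are expected to
 * return with the kernel lock released (see the comments above
 * do_swap_page() and do_no_page()).
 */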
/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
        unsigned long address, int write_access)
{
        pgd_t *pgd;
        pmd_t *pmd;

        pgd = pgd_offset(vma->vm_mm, address);
        pmd = pmd_alloc(pgd, address);
        if (pmd) {
                pte_t * pte = pte_alloc(pmd, address);
                if (pte) {
                        if (handle_pte_fault(tsk, vma, address, write_access, pte)) {
                                update_mmu_cache(vma, address, *pte);
                                return 1;
                        }
                }
        }
        return 0;
}
/*
 * Simplistic page force-in..
 */
void make_pages_present(unsigned long addr, unsigned long end)
{
        int write;
        struct vm_area_struct * vma;

        vma = find_vma(current->mm, addr);
        write = (vma->vm_flags & VM_WRITE) != 0;
        while (addr < end) {
                handle_mm_fault(current, vma, addr, write);
                addr += PAGE_SIZE;
        }
}
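/*
 * make_pages_present() simply forces a fault on every page in the
 * range so that each one gets mapped in (as a write fault if the vma
 * is writable, so private pages are also COW-broken up front).
 * Callers such as the mlock() path presumably use it to pre-populate
 * a pinned region; those callers live outside this file and are not
 * verified here.
 */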