/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/string.h>
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;
/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
	if (from == ZERO_PAGE) {
		clear_page(to);
		return;
	}
	copy_page(to, from);
}

mem_map_t * mem_map = NULL;
/*
 * oom() prints a message (so that the user knows why the process died),
 * and gives the process an untrappable SIGKILL.
 */
void oom(struct task_struct * task)
{
	printk("\nOut of memory for %s.\n", task->comm);
	force_sig(SIGKILL, task);
}
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
	pte_t * pte;

	if (pmd_none(*dir))
		return;
	if (pmd_bad(*dir)) {
		printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
		pmd_clear(dir);
		return;
	}
	pte = pte_offset(dir, 0);
	pmd_clear(dir);
	pte_free(pte);
}
static inline void free_one_pgd(pgd_t * dir)
{
	int j;
	pmd_t * pmd;

	if (pgd_none(*dir))
		return;
	if (pgd_bad(*dir)) {
		printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
		pgd_clear(dir);
		return;
	}
	pmd = pmd_offset(dir, 0);
	pgd_clear(dir);
	for (j = 0; j < PTRS_PER_PMD ; j++)
		free_one_pmd(pmd+j);
	pmd_free(pmd);
}
/* Low and high watermarks for page table cache.
   The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
 */
int pgt_cache_water[2] = { 25, 50 };
/* Returns the number of pages freed */
int check_pgt_cache(void)
{
	return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}
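/*
 * A worked reading of the watermarks above (an interpretation, not part of
 * the original documentation): with the default { 25, 50 },
 * do_check_pgt_cache() is presumably asked to trim the cache of
 * pre-allocated page-table pages back towards 25 entries once it has grown
 * beyond 50, and it reports how many pages it actually freed.
 */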
/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct task_struct * tsk)
{
	pgd_t * page_dir = tsk->mm->pgd;
	int i;

	if (!page_dir || page_dir == swapper_pg_dir)
		goto out_bad;
	for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
		free_one_pgd(page_dir + i);

	/* keep the page table cache within bounds */
	check_pgt_cache();
	return;

out_bad:
	printk(KERN_ERR
	       "clear_page_tables: %s trying to clear kernel pgd\n",
	       tsk->comm);
	return;
}
/*
 * This function frees up all page tables of a process when it exits. It
 * is the same as "clear_page_tables()", except it also frees the old
 * page table directory.
 */
void free_page_tables(struct mm_struct * mm)
{
	pgd_t * page_dir = mm->pgd;
	int i;

	if (!page_dir)
		goto out;
	if (page_dir == swapper_pg_dir)
		goto out_bad;
	for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
		free_one_pgd(page_dir + i);
	pgd_free(page_dir);

	/* keep the page table cache within bounds */
	check_pgt_cache();
out:
	return;

out_bad:
	printk(KERN_ERR
	       "free_page_tables: Trying to free kernel pgd\n");
	return;
}
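/*
 * Give "tsk" a fresh page directory: allocate one with pgd_alloc(),
 * hand it to the architecture code via SET_PAGE_DIR(), and record it
 * in tsk->mm->pgd.  Returns 0 on success, -ENOMEM if the allocation
 * fails.  (Descriptive note; the function below is otherwise
 * undocumented.)
 */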
int new_page_tables(struct task_struct * tsk)
{
	pgd_t * new_pg;

	if (!(new_pg = pgd_alloc()))
		return -ENOMEM;
	SET_PAGE_DIR(tsk, new_pg);
	tsk->mm->pgd = new_pg;
	return 0;
}
#define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
			struct vm_area_struct *vma)
{
	pgd_t * src_pgd, * dst_pgd;
	unsigned long address = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;

	src_pgd = pgd_offset(src, address)-1;
	dst_pgd = pgd_offset(dst, address)-1;

	for (;;) {
		pmd_t * src_pmd, * dst_pmd;

		src_pgd++; dst_pgd++;

		/* copy_pmd_range */

		if (pgd_none(*src_pgd))
			goto skip_copy_pmd_range;
		if (pgd_bad(*src_pgd)) {
			printk("copy_pmd_range: bad pgd (%08lx)\n",
				pgd_val(*src_pgd));
			pgd_clear(src_pgd);
skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
			if (address >= end)
				goto out;
			continue;
		}
		if (pgd_none(*dst_pgd)) {
			if (!pmd_alloc(dst_pgd, 0))
				goto nomem;
		}

		src_pmd = pmd_offset(src_pgd, address);
		dst_pmd = pmd_offset(dst_pgd, address);

		do {
			pte_t * src_pte, * dst_pte;

			/* copy_pte_range */

			if (pmd_none(*src_pmd))
				goto skip_copy_pte_range;
			if (pmd_bad(*src_pmd)) {
				printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
				pmd_clear(src_pmd);
skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
				if (address >= end)
					goto out;
				goto cont_copy_pmd_range;
			}
			if (pmd_none(*dst_pmd)) {
				if (!pte_alloc(dst_pmd, 0))
					goto nomem;
			}

			src_pte = pte_offset(src_pmd, address);
			dst_pte = pte_offset(dst_pmd, address);

			do {
				pte_t pte = *src_pte;
				unsigned long page_nr;

				/* copy_one_pte */

				if (pte_none(pte))
					goto cont_copy_pte_range;
				if (!pte_present(pte)) {
					swap_duplicate(pte_val(pte));
					set_pte(dst_pte, pte);
					goto cont_copy_pte_range;
				}
				page_nr = MAP_NR(pte_page(pte));
				if (page_nr >= max_mapnr ||
				    PageReserved(mem_map+page_nr)) {
					set_pte(dst_pte, pte);
					goto cont_copy_pte_range;
				}
				if (cow)
					pte = pte_wrprotect(pte);
				set_pte(dst_pte, pte_mkold(pte));
				set_pte(src_pte, pte);
				atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:		address += PAGE_SIZE;
				if (address >= end)
					goto out;
				src_pte++;
				dst_pte++;
			} while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:	src_pmd++;
			dst_pmd++;
		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
	}
out:
	return 0;

nomem:
	return -ENOMEM;
}
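/*
 * Note on the "cow" logic above (a summary, not original documentation):
 * cow is non-zero only for private writable mappings (VM_WRITE set,
 * VM_SHARED clear).  For those, the pte is write-protected in both the
 * parent and the child, so the physical page stays shared read-only until
 * one side writes to it and do_wp_page() below gives that side its own
 * copy.
 */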
/*
 * Return indicates whether a page was freed so caller can adjust rss
 */
static inline int free_pte(pte_t page)
{
	if (pte_present(page)) {
		unsigned long addr = pte_page(page);
		if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
			return 0;
		/*
		 * free_page() used to be able to clear swap cache
		 * entries.  We may now have to do it manually.
		 */
		free_page_and_swap_cache(addr);
		return 1;
	}
	swap_free(pte_val(page));
	return 0;
}
static inline void forget_pte(pte_t page)
{
	if (!pte_none(page)) {
		printk("forget_pte: old mapping existed!\n");
		free_pte(page);
	}
}
static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
	pte_t * pte;
	int freed;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	address &= ~PMD_MASK;
	if (address + size > PMD_SIZE)
		size = PMD_SIZE - address;
	size >>= PAGE_SHIFT;
	freed = 0;
	for (;;) {
		pte_t page;
		if (!size)
			break;
		page = *pte;
		pte++;
		size--;
		if (pte_none(page))
			continue;
		pte_clear(pte-1);
		freed += free_pte(page);
	}
	return freed;
}
static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
{
	pmd_t * pmd;
	unsigned long end;
	int freed;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}
	pmd = pmd_offset(dir, address);
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	freed = 0;
	do {
		freed += zap_pte_range(pmd, address, end - address);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return freed;
}
/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int freed = 0;

	dir = pgd_offset(mm, address);
	while (address < end) {
		freed += zap_pmd_range(dir, address, end - address);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	/*
	 * Update rss for the mm_struct (not necessarily current->mm)
	 */
	if (mm->rss > 0) {
		mm->rss -= freed;
		if (mm->rss < 0)
			mm->rss = 0;
	}
}
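/*
 * Usage note (a summary of how this file itself uses the function):
 * zap_page_range() only tears down the page table entries and drops the
 * page or swap references; callers such as vmtruncate() below bracket it
 * with flush_cache_range() beforehand and flush_tlb_range() afterwards.
 */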
static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		pte_t oldpage = *pte;
		set_pte(pte, zero_pte);
		forget_pte(oldpage);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	do {
		pte_t * pte = pte_alloc(pmd, address);
		if (!pte)
			return -ENOMEM;
		zeromap_pte_range(pte, address, end - address, zero_pte);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = address;
	unsigned long end = address + size;
	pte_t zero_pte;

	zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
	dir = pgd_offset(current->mm, address);
	flush_cache_range(current->mm, beg, end);
	while (address < end) {
		pmd_t *pmd = pmd_alloc(dir, address);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
		if (error)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(current->mm, beg, end);
	return error;
}
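/*
 * The whole range ends up mapped to the single global ZERO_PAGE,
 * write-protected: reads return zeroes without allocating any memory,
 * and the first write to a page takes a protection fault that is
 * resolved by the copy-on-write path (do_wp_page() below, which uses
 * copy_cow_page() above to avoid actually reading the zero page).
 */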
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access")
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		unsigned long mapnr;
		pte_t oldpage = *pte;
		pte_clear(pte);

		mapnr = MAP_NR(__va(phys_addr));
		if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
			set_pte(pte, mk_pte_phys(phys_addr, prot));
		forget_pte(oldpage);
		address += PAGE_SIZE;
		phys_addr += PAGE_SIZE;
		pte++;
	} while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	phys_addr -= address;
	do {
		pte_t * pte = pte_alloc(pmd, address);
		if (!pte)
			return -ENOMEM;
		remap_pte_range(pte, address, end - address, address + phys_addr, prot);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = from;
	unsigned long end = from + size;

	phys_addr -= from;
	dir = pgd_offset(current->mm, from);
	flush_cache_range(current->mm, beg, end);
	while (from < end) {
		pmd_t *pmd = pmd_alloc(dir, from);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
		if (error)
			break;
		from = (from + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(current->mm, beg, end);
	return error;
}
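/*
 * Typical use of remap_page_range() (a sketch, not part of this file): a
 * character driver's mmap() method in a 2.2-era kernel maps its device
 * memory into the caller's vma with something like
 *
 *	static int mydev_mmap(struct file * file, struct vm_area_struct * vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		if (remap_page_range(vma->vm_start,
 *				     MYDEV_PHYS_BASE + vma->vm_offset,
 *				     size, vma->vm_page_prot))
 *			return -EAGAIN;
 *		return 0;
 *	}
 *
 * where mydev_mmap and MYDEV_PHYS_BASE are made-up names standing for the
 * driver's mmap operation and its device's physical base address.
 */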
/*
 * sanity-check function..
 */
static void put_page(pte_t * page_table, pte_t pte)
{
	if (!pte_none(*page_table)) {
		free_page_and_swap_cache(pte_page(pte));
		return;
	}
	/* no need for flush_tlb */
	set_pte(page_table, pte);
}
/*
 * This routine is used to map a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
	pgd_t * pgd;
	pmd_t * pmd;
	pte_t * pte;

	if (MAP_NR(page) >= max_mapnr)
		printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
	if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
		printk("mem_map disagrees with %08lx at %08lx\n",page,address);
	pgd = pgd_offset(tsk->mm,address);
	pmd = pmd_alloc(pgd, address);
	if (!pmd) {
		free_page(page);
		oom(tsk);
		return 0;
	}
	pte = pte_alloc(pmd, address);
	if (!pte) {
		free_page(page);
		oom(tsk);
		return 0;
	}
	if (!pte_none(*pte)) {
		printk("put_dirty_page: page already exists\n");
		free_page(page);
		return 0;
	}
	flush_page_to_ram(page);
	set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
	/* no need for flush_tlb */
	return page;
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here is that it results
 * in better assembly code.. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 */
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, pte_t *page_table)
{
	pte_t pte;
	unsigned long old_page, new_page;
	struct page * page_map;

	pte = *page_table;
	new_page = __get_free_page(GFP_USER);
	/* Did someone else copy this page for us while we slept? */
	if (pte_val(*page_table) != pte_val(pte))
		goto end_wp_page;
	if (!pte_present(pte))
		goto end_wp_page;
	if (pte_write(pte))
		goto end_wp_page;
	old_page = pte_page(pte);
	if (MAP_NR(old_page) >= max_mapnr)
		goto bad_wp_page;
	tsk->min_flt++;
	page_map = mem_map + MAP_NR(old_page);

	/*
	 * Do we need to copy?
	 */
	if (is_page_shared(page_map)) {
		if (new_page) {
			if (PageReserved(mem_map + MAP_NR(old_page)))
				++vma->vm_mm->rss;
			copy_cow_page(old_page,new_page);
			flush_page_to_ram(old_page);
			flush_page_to_ram(new_page);
			flush_cache_page(vma, address);
			set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
			free_page(old_page);
			flush_tlb_page(vma, address);
			return 1;
		}
		flush_cache_page(vma, address);
		set_pte(page_table, BAD_PAGE);
		flush_tlb_page(vma, address);
		free_page(old_page);
		oom(tsk);
		return 0;
	}
	if (PageSwapCache(page_map))
		delete_from_swap_cache(page_map);
	flush_cache_page(vma, address);
	set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
	flush_tlb_page(vma, address);
end_wp_page:
	if (new_page)
		free_page(new_page);
	return 1;

bad_wp_page:
	printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
	send_sig(SIGKILL, tsk, 1);
	if (new_page)
		free_page(new_page);
	return 0;
}
/*
 * This function zeroes out partial mmap'ed pages at truncation time..
 */
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
	pgd_t *page_dir;
	pmd_t *page_middle;
	pte_t *page_table, pte;

	page_dir = pgd_offset(vma->vm_mm, address);
	if (pgd_none(*page_dir))
		return;
	if (pgd_bad(*page_dir)) {
		printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
		pgd_clear(page_dir);
		return;
	}
	page_middle = pmd_offset(page_dir, address);
	if (pmd_none(*page_middle))
		return;
	if (pmd_bad(*page_middle)) {
		printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
		pmd_clear(page_middle);
		return;
	}
	page_table = pte_offset(page_middle, address);
	pte = *page_table;
	if (!pte_present(pte))
		return;
	flush_cache_page(vma, address);
	address &= ~PAGE_MASK;
	address += pte_page(pte);
	if (MAP_NR(address) >= max_mapnr)
		return;
	memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
	flush_page_to_ram(pte_page(pte));
}
/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page. Ugly, but necessary.
 */
void vmtruncate(struct inode * inode, unsigned long offset)
{
	struct vm_area_struct * mpnt;

	truncate_inode_pages(inode, offset);
	if (!inode->i_mmap)
		return;
	mpnt = inode->i_mmap;
	do {
		struct mm_struct *mm = mpnt->vm_mm;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long len = end - start;
		unsigned long diff;

		/* mapping wholly truncated? */
		if (mpnt->vm_offset >= offset) {
			flush_cache_range(mm, start, end);
			zap_page_range(mm, start, len);
			flush_tlb_range(mm, start, end);
			continue;
		}
		/* mapping wholly unaffected? */
		diff = offset - mpnt->vm_offset;
		if (diff >= len)
			continue;
		/* Ok, partially affected.. */
		start += diff;
		len = (len - diff) & PAGE_MASK;
		if (start & ~PAGE_MASK) {
			partial_clear(mpnt, start);
			start = (start + ~PAGE_MASK) & PAGE_MASK;
		}
		flush_cache_range(mm, start, end);
		zap_page_range(mm, start, len);
		flush_tlb_range(mm, start, end);
	} while ((mpnt = mpnt->vm_next_share) != NULL);
}
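/*
 * Fault in a page that was swapped out (the pte is not present and not
 * empty).  Either the generic swap_in() path is used or, if the vma
 * provides a swapin operation, that is asked to produce the new pte; in
 * the latter case the pte is dropped again if somebody else already
 * handled the fault while we slept, and is write-protected when the page
 * is still shared and the mapping is private.  Runs under the big kernel
 * lock.  (Descriptive note; the function below is otherwise undocumented.)
 */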
static int do_swap_page(struct task_struct * tsk,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, pte_t entry, int write_access)
{
	lock_kernel();
	if (!vma->vm_ops || !vma->vm_ops->swapin) {
		swap_in(tsk, vma, page_table, pte_val(entry), write_access);
		flush_page_to_ram(pte_page(*page_table));
	} else {
		pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
		if (pte_val(*page_table) != pte_val(entry)) {
			free_page(pte_page(page));
		} else {
			if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
			    !(vma->vm_flags & VM_SHARED))
				page = pte_wrprotect(page);
			++vma->vm_mm->rss;
			++tsk->maj_flt;
			flush_page_to_ram(pte_page(page));
			set_pte(page_table, page);
		}
	}
	unlock_kernel();
	return 1;
}
/*
 * This only needs the MM semaphore
 */
static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access)
{
	pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
	if (write_access) {
		unsigned long page = __get_free_page(GFP_USER);
		if (!page)
			return 0;
		clear_page(page);
		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
		vma->vm_mm->rss++;
		tsk->min_flt++;
		flush_page_to_ram(page);
	}
	put_page(page_table, entry);
	return 1;
}
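/*
 * In other words: an anonymous read fault costs nothing but a read-only
 * reference to the shared ZERO_PAGE, while an anonymous write fault
 * allocates and zeroes a private page up front and maps it writable and
 * dirty.
 */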
/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * This is called with the MM semaphore held, but without the kernel
 * lock.
 */
static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	unsigned long page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(tsk, vma, page_table, write_access);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible. It's
	 * essentially an early COW detection.
	 *
	 * We need to grab the kernel lock for this..
	 */
	lock_kernel();
	page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
		(vma->vm_flags & VM_SHARED)?0:write_access);
	unlock_kernel();
	if (!page)
		return 0;

	++tsk->maj_flt;
	++vma->vm_mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		entry = pte_wrprotect(entry);
	put_page(page_table, entry);
	/* no need to invalidate: a not-present page shouldn't be cached */
	return 1;
}
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 */
static inline int handle_pte_fault(struct task_struct *tsk,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry = *pte;

	if (!pte_present(entry)) {
		if (pte_none(entry))
			return do_no_page(tsk, vma, address, write_access, pte);
		return do_swap_page(tsk, vma, address, pte, entry, write_access);
	}

	entry = pte_mkyoung(entry);
	set_pte(pte, entry);
	flush_tlb_page(vma, address);
	if (!write_access)
		return 1;

	if (pte_write(entry)) {
		entry = pte_mkdirty(entry);
		set_pte(pte, entry);
		flush_tlb_page(vma, address);
		return 1;
	}
	return do_wp_page(tsk, vma, address, pte);
}
/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	pgd_t *pgd;
	pmd_t *pmd;

	pgd = pgd_offset(vma->vm_mm, address);
	pmd = pmd_alloc(pgd, address);
	if (pmd) {
		pte_t * pte = pte_alloc(pmd, address);
		if (pte) {
			if (handle_pte_fault(tsk, vma, address, write_access, pte)) {
				update_mmu_cache(vma, address, *pte);
				return 1;
			}
		}
	}
	return 0;
}
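/*
 * Summary of the fault paths above (a reading of the code, not original
 * documentation):
 *
 *	pte state		write_access	handled by
 *	-----------------	------------	------------------------------
 *	none			any		do_no_page / do_anonymous_page
 *	!present, !none		any		do_swap_page
 *	present, writable	1		mark young + dirty, done
 *	present, read-only	1		do_wp_page (COW)
 *	present			0		mark young, done
 *
 * handle_mm_fault() returns 1 if the fault was serviced, 0 if it could not
 * be (failed page-table allocation, out of memory, or a bogus page), in
 * which case the architecture's fault handler decides what to do.
 */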
/*
 * Simplistic page force-in..
 */
void make_pages_present(unsigned long addr, unsigned long end)
{
	int write;
	struct vm_area_struct * vma;

	vma = find_vma(current->mm, addr);
	write = (vma->vm_flags & VM_WRITE) != 0;
	while (addr < end) {
		handle_mm_fault(current, vma, addr, write);
		addr += PAGE_SIZE;
	}
}