[davej-history.git] mm/memory.c (Import 2.1.122pre3)
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 *
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *              Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 *
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/string.h>
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;
/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
        if (from == ZERO_PAGE) {
                clear_page(to);
                return;
        }
        copy_page(to, from);
}
mem_map_t * mem_map = NULL;
/*
 * oom() prints a message (so that the user knows why the process died),
 * and gives the process an untrappable SIGKILL.
 */
void oom(struct task_struct * task)
{
        printk("\nOut of memory for %s.\n", task->comm);
        force_sig(SIGKILL, task);
}
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
        pte_t * pte;

        if (pmd_none(*dir))
                return;
        if (pmd_bad(*dir)) {
                printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
                pmd_clear(dir);
                return;
        }
        pte = pte_offset(dir, 0);
        pmd_clear(dir);
        pte_free(pte);
}
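/*
 * Tear down everything under one pgd entry: free each pte table via
 * free_one_pmd(), then free the pmd table itself. As with free_one_pmd(),
 * the pages these tables mapped have already been freed.
 */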
static inline void free_one_pgd(pgd_t * dir)
{
        int j;
        pmd_t * pmd;

        if (pgd_none(*dir))
                return;
        if (pgd_bad(*dir)) {
                printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
                pgd_clear(dir);
                return;
        }
        pmd = pmd_offset(dir, 0);
        pgd_clear(dir);
        for (j = 0; j < PTRS_PER_PMD ; j++)
                free_one_pmd(pmd+j);
        pmd_free(pmd);
}
/*
 * Low and high watermarks for the page table cache.
 * The system should try to have pgt_cache_water[0] <= cache elements <= pgt_cache_water[1]
 */
int pgt_cache_water[2] = { 25, 50 };

/* Returns the number of pages freed */
int check_pgt_cache(void)
{
        return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}
/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct task_struct * tsk)
{
        pgd_t * page_dir = tsk->mm->pgd;
        int i;

        if (!page_dir || page_dir == swapper_pg_dir)
                goto out_bad;
        for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
                free_one_pgd(page_dir + i);

        /* keep the page table cache within bounds */
        check_pgt_cache();
        return;

out_bad:
        printk(KERN_ERR
               "clear_page_tables: %s trying to clear kernel pgd\n",
               tsk->comm);
        return;
}
/*
 * This function frees up all page tables of a process when it exits. It
 * is the same as "clear_page_tables()", except it also frees the old
 * page table directory.
 */
void free_page_tables(struct mm_struct * mm)
{
        pgd_t * page_dir = mm->pgd;
        int i;

        if (!page_dir)
                goto out;
        if (page_dir == swapper_pg_dir)
                goto out_bad;
        for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
                free_one_pgd(page_dir + i);
        pgd_free(page_dir);

        /* keep the page table cache within bounds */
        check_pgt_cache();
out:
        return;

out_bad:
        printk(KERN_ERR
               "free_page_tables: Trying to free kernel pgd\n");
        return;
}
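/*
 * Allocate a fresh page directory for a task and install it, both in
 * tsk->mm->pgd and in the hardware context via SET_PAGE_DIR().
 */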
int new_page_tables(struct task_struct * tsk)
{
        pgd_t * new_pg;

        if (!(new_pg = pgd_alloc()))
                return -ENOMEM;
        SET_PAGE_DIR(tsk, new_pg);
        tsk->mm->pgd = new_pg;
        return 0;
}
#define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK  ((PTRS_PER_PMD-1) * sizeof(pmd_t))
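/*
 * The masks above cover the byte offset of an entry within its
 * (page-aligned) table, so "pointer & mask" becomes zero exactly when an
 * incremented table pointer has wrapped to the start of the next table.
 * copy_page_range() below uses this as its inner loop condition.
 */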
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma)
{
        pgd_t * src_pgd, * dst_pgd;
        unsigned long address = vma->vm_start;
        unsigned long end = vma->vm_end;
        unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;

        src_pgd = pgd_offset(src, address)-1;
        dst_pgd = pgd_offset(dst, address)-1;

        for (;;) {
                pmd_t * src_pmd, * dst_pmd;

                src_pgd++; dst_pgd++;

                /* copy_pmd_range */

                if (pgd_none(*src_pgd))
                        goto skip_copy_pmd_range;
                if (pgd_bad(*src_pgd)) {
                        printk("copy_pmd_range: bad pgd (%08lx)\n",
                                pgd_val(*src_pgd));
                        pgd_clear(src_pgd);
skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
                        if (address >= end)
                                goto out;
                        continue;
                }
                if (pgd_none(*dst_pgd)) {
                        if (!pmd_alloc(dst_pgd, 0))
                                goto nomem;
                }

                src_pmd = pmd_offset(src_pgd, address);
                dst_pmd = pmd_offset(dst_pgd, address);

                do {
                        pte_t * src_pte, * dst_pte;

                        /* copy_pte_range */

                        if (pmd_none(*src_pmd))
                                goto skip_copy_pte_range;
                        if (pmd_bad(*src_pmd)) {
                                printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
                                pmd_clear(src_pmd);
skip_copy_pte_range:            address = (address + PMD_SIZE) & PMD_MASK;
                                if (address >= end)
                                        goto out;
                                goto cont_copy_pmd_range;
                        }
                        if (pmd_none(*dst_pmd)) {
                                if (!pte_alloc(dst_pmd, 0))
                                        goto nomem;
                        }

                        src_pte = pte_offset(src_pmd, address);
                        dst_pte = pte_offset(dst_pmd, address);

                        do {
                                pte_t pte = *src_pte;
                                unsigned long page_nr;

                                /* copy_one_pte */

                                if (pte_none(pte))
                                        goto cont_copy_pte_range;
                                if (!pte_present(pte)) {
                                        swap_duplicate(pte_val(pte));
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                page_nr = MAP_NR(pte_page(pte));
                                if (page_nr >= max_mapnr ||
                                    PageReserved(mem_map+page_nr)) {
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                if (cow)
                                        pte = pte_wrprotect(pte);
#if 0 /* No longer needed with the new swap cache code */
                                if (delete_from_swap_cache(&mem_map[page_nr]))
                                        pte = pte_mkdirty(pte);
#endif
                                set_pte(dst_pte, pte_mkold(pte));
                                set_pte(src_pte, pte);
                                atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:            address += PAGE_SIZE;
                                if (address >= end)
                                        goto out;
                                src_pte++;
                                dst_pte++;
                        } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:    src_pmd++;
                        dst_pmd++;
                } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
        }
out:
        return 0;

nomem:
        return -ENOMEM;
}
/*
 * Return indicates whether a page was freed so caller can adjust rss
 */
static inline int free_pte(pte_t page)
{
        if (pte_present(page)) {
                unsigned long addr = pte_page(page);
                if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
                        return 0;
                /*
                 * free_page() used to be able to clear swap cache
                 * entries. We may now have to do it manually.
                 */
                free_page_and_swap_cache(addr);
                return 1;
        }
        swap_free(pte_val(page));
        return 0;
}
static inline void forget_pte(pte_t page)
{
        if (!pte_none(page)) {
                printk("forget_pte: old mapping existed!\n");
                free_pte(page);
        }
}
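/*
 * Clear the ptes covered by [address, address+size) within one pmd and
 * free the pages they referenced. Returns the number of pages actually
 * freed so the caller can adjust rss.
 */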
static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
        pte_t * pte;
        int freed;

        if (pmd_none(*pmd))
                return 0;
        if (pmd_bad(*pmd)) {
                printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
                pmd_clear(pmd);
                return 0;
        }
        pte = pte_offset(pmd, address);
        address &= ~PMD_MASK;
        if (address + size > PMD_SIZE)
                size = PMD_SIZE - address;
        size >>= PAGE_SHIFT;
        freed = 0;
        for (;;) {
                pte_t page;
                if (!size)
                        break;
                page = *pte;
                pte++;
                size--;
                if (pte_none(page))
                        continue;
                pte_clear(pte-1);
                freed += free_pte(page);
        }
        return freed;
}
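/*
 * Walk the pmd entries under one pgd entry, zapping each pte range in
 * turn, and return the total number of pages freed.
 */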
static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
{
        pmd_t * pmd;
        unsigned long end;
        int freed;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }
        pmd = pmd_offset(dir, address);
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        freed = 0;
        do {
                freed += zap_pte_range(pmd, address, end - address);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return freed;
}
/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
        pgd_t * dir;
        unsigned long end = address + size;
        int freed = 0;

        dir = pgd_offset(mm, address);
        while (address < end) {
                freed += zap_pmd_range(dir, address, end - address);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        /*
         * Update rss for the mm_struct (not necessarily current->mm)
         */
        if (mm->rss > 0) {
                mm->rss -= freed;
                if (mm->rss < 0)
                        mm->rss = 0;
        }
}
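/*
 * The zeromap_*() helpers below point every pte in the given range at
 * the global zero page, write-protected, dropping (and complaining
 * about) whatever mapping was there before.
 */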
static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                pte_t oldpage = *pte;
                set_pte(pte, zero_pte);
                forget_pte(oldpage);
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        do {
                pte_t * pte = pte_alloc(pmd, address);
                if (!pte)
                        return -ENOMEM;
                zeromap_pte_range(pte, address, end - address, zero_pte);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
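/*
 * Map [address, address+size) in the current process to the zero page,
 * read-only, allocating any missing pmd and pte tables along the way.
 */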
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
        int error = 0;
        pgd_t * dir;
        unsigned long beg = address;
        unsigned long end = address + size;
        pte_t zero_pte;

        zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
        dir = pgd_offset(current->mm, address);
        flush_cache_range(current->mm, beg, end);
        while (address < end) {
                pmd_t *pmd = pmd_alloc(dir, address);
                error = -ENOMEM;
                if (!pmd)
                        break;
                error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
                if (error)
                        break;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(current->mm, beg, end);
        return error;
}
/*
 * Maps a range of physical memory into the requested pages. The old
 * mappings are removed. Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
        unsigned long phys_addr, pgprot_t prot)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                unsigned long mapnr;
                pte_t oldpage = *pte;
                pte_clear(pte);

                mapnr = MAP_NR(__va(phys_addr));
                if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
                        set_pte(pte, mk_pte_phys(phys_addr, prot));
                forget_pte(oldpage);
                address += PAGE_SIZE;
                phys_addr += PAGE_SIZE;
                pte++;
        } while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
        unsigned long phys_addr, pgprot_t prot)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        phys_addr -= address;
        do {
                pte_t * pte = pte_alloc(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_pte_range(pte, address, end - address, address + phys_addr, prot);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
        int error = 0;
        pgd_t * dir;
        unsigned long beg = from;
        unsigned long end = from + size;

        phys_addr -= from;
        dir = pgd_offset(current->mm, from);
        flush_cache_range(current->mm, beg, end);
        while (from < end) {
                pmd_t *pmd = pmd_alloc(dir, from);
                error = -ENOMEM;
                if (!pmd)
                        break;
                error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
                if (error)
                        break;
                from = (from + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(current->mm, beg, end);
        return error;
}
/*
 * sanity-check function..
 */
static void put_page(pte_t * page_table, pte_t pte)
{
        if (!pte_none(*page_table)) {
                free_page_and_swap_cache(pte_page(pte));
                return;
        }
        /* no need for flush_tlb */
        set_pte(page_table, pte);
}
/*
 * This routine is used to map in a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
        pgd_t * pgd;
        pmd_t * pmd;
        pte_t * pte;

        if (MAP_NR(page) >= max_mapnr)
                printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
        if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
                printk("mem_map disagrees with %08lx at %08lx\n",page,address);
        pgd = pgd_offset(tsk->mm,address);
        pmd = pmd_alloc(pgd, address);
        if (!pmd) {
                free_page(page);
                oom(tsk);
                return 0;
        }
        pte = pte_alloc(pmd, address);
        if (!pte) {
                free_page(page);
                oom(tsk);
                return 0;
        }
        if (!pte_none(*pte)) {
                printk("put_dirty_page: page already exists\n");
                free_page(page);
                return 0;
        }
        flush_page_to_ram(page);
        set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
        /* no need for flush_tlb */
        return page;
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here is that it results
 * in better assembly code.. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 */
static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, pte_t *page_table)
{
        pte_t pte;
        unsigned long old_page, new_page;
        struct page * page_map;

        pte = *page_table;
        new_page = __get_free_page(GFP_KERNEL);
        /* Did someone else copy this page for us while we slept? */
        if (pte_val(*page_table) != pte_val(pte))
                goto end_wp_page;
        if (!pte_present(pte))
                goto end_wp_page;
        if (pte_write(pte))
                goto end_wp_page;
        old_page = pte_page(pte);
        if (MAP_NR(old_page) >= max_mapnr)
                goto bad_wp_page;
        tsk->min_flt++;
        page_map = mem_map + MAP_NR(old_page);

        /*
         * Do we need to copy?
         */
        if (is_page_shared(page_map)) {
                if (new_page) {
                        if (PageReserved(mem_map + MAP_NR(old_page)))
                                ++vma->vm_mm->rss;
                        copy_cow_page(old_page,new_page);
                        flush_page_to_ram(old_page);
                        flush_page_to_ram(new_page);
                        flush_cache_page(vma, address);
                        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
                        free_page(old_page);
                        flush_tlb_page(vma, address);
                        return;
                }
                flush_cache_page(vma, address);
                set_pte(page_table, BAD_PAGE);
                flush_tlb_page(vma, address);
                free_page(old_page);
                oom(tsk);
                return;
        }
        if (PageSwapCache(page_map))
                delete_from_swap_cache(page_map);
        flush_cache_page(vma, address);
        set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
        flush_tlb_page(vma, address);
        if (new_page)
                free_page(new_page);
        return;

bad_wp_page:
        printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
        send_sig(SIGKILL, tsk, 1);
end_wp_page:
        if (new_page)
                free_page(new_page);
        return;
}
/*
 * This function zeroes out partial mmap'ed pages at truncation time..
 */
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
        pgd_t *page_dir;
        pmd_t *page_middle;
        pte_t *page_table, pte;

        page_dir = pgd_offset(vma->vm_mm, address);
        if (pgd_none(*page_dir))
                return;
        if (pgd_bad(*page_dir)) {
                printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
                pgd_clear(page_dir);
                return;
        }
        page_middle = pmd_offset(page_dir, address);
        if (pmd_none(*page_middle))
                return;
        if (pmd_bad(*page_middle)) {
                printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
                pmd_clear(page_middle);
                return;
        }
        page_table = pte_offset(page_middle, address);
        pte = *page_table;
        if (!pte_present(pte))
                return;
        flush_cache_page(vma, address);
        address &= ~PAGE_MASK;
        address += pte_page(pte);
        if (MAP_NR(address) >= max_mapnr)
                return;
        memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
        flush_page_to_ram(pte_page(pte));
}
/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page. Ugly, but necessary.
 */
void vmtruncate(struct inode * inode, unsigned long offset)
{
        struct vm_area_struct * mpnt;

        truncate_inode_pages(inode, offset);
        if (!inode->i_mmap)
                return;
        mpnt = inode->i_mmap;
        do {
                struct mm_struct *mm = mpnt->vm_mm;
                unsigned long start = mpnt->vm_start;
                unsigned long end = mpnt->vm_end;
                unsigned long len = end - start;
                unsigned long diff;

                /* mapping wholly truncated? */
                if (mpnt->vm_offset >= offset) {
                        flush_cache_range(mm, start, end);
                        zap_page_range(mm, start, len);
                        flush_tlb_range(mm, start, end);
                        continue;
                }
                /* mapping wholly unaffected? */
                diff = offset - mpnt->vm_offset;
                if (diff >= len)
                        continue;
                /* Ok, partially affected.. */
                start += diff;
                len = (len - diff) & PAGE_MASK;
                if (start & ~PAGE_MASK) {
                        partial_clear(mpnt, start);
                        start = (start + ~PAGE_MASK) & PAGE_MASK;
                }
                flush_cache_range(mm, start, end);
                zap_page_range(mm, start, len);
                flush_tlb_range(mm, start, end);
        } while ((mpnt = mpnt->vm_next_share) != NULL);
}
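/*
 * The pte is not present but not empty, so it refers to a swapped-out
 * page. Bring the page back in, either through the generic swap_in()
 * path or through the vma's own swapin() operation. In the latter case
 * the pte is re-checked after the (possibly sleeping) swapin so that we
 * don't clobber an entry somebody else already fixed up, and the page
 * is write-protected if it is still shared and the mapping isn't
 * VM_SHARED.
 */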
static inline void do_swap_page(struct task_struct * tsk,
        struct vm_area_struct * vma, unsigned long address,
        pte_t * page_table, pte_t entry, int write_access)
{
        pte_t page;

        if (!vma->vm_ops || !vma->vm_ops->swapin) {
                swap_in(tsk, vma, page_table, pte_val(entry), write_access);
                flush_page_to_ram(pte_page(*page_table));
                return;
        }
        page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
        if (pte_val(*page_table) != pte_val(entry)) {
                free_page(pte_page(page));
                return;
        }
        if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
            !(vma->vm_flags & VM_SHARED))
                page = pte_wrprotect(page);
        ++vma->vm_mm->rss;
        ++tsk->maj_flt;
        flush_page_to_ram(pte_page(page));
        set_pte(page_table, page);
        return;
}
/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, int write_access, pte_t *page_table, pte_t entry)
{
        unsigned long page;

        if (!pte_none(entry))
                goto swap_page;
        address &= PAGE_MASK;
        if (!vma->vm_ops || !vma->vm_ops->nopage)
                goto anonymous_page;

        /*
         * The third argument is "no_share", which tells the low-level code
         * to copy, not share the page even if sharing is possible. It's
         * essentially an early COW detection.
         */
        page = vma->vm_ops->nopage(vma, address,
                (vma->vm_flags & VM_SHARED)?0:write_access);
        if (!page)
                goto sigbus;
        ++tsk->maj_flt;
        ++vma->vm_mm->rss;
        /*
         * This silly early PAGE_DIRTY setting removes a race
         * due to the bad i386 page protection. But it's valid
         * for other architectures too.
         *
         * Note that if write_access is true, we either now have
         * an exclusive copy of the page, or this is a shared mapping,
         * so we can make it writable and dirty to avoid having to
         * handle that later.
         */
        flush_page_to_ram(page);
        entry = mk_pte(page, vma->vm_page_prot);
        if (write_access) {
                entry = pte_mkwrite(pte_mkdirty(entry));
        } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
                   !(vma->vm_flags & VM_SHARED))
                entry = pte_wrprotect(entry);
        put_page(page_table, entry);
        /* no need to invalidate: a not-present page shouldn't be cached */
        return;

anonymous_page:
        entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
        if (write_access) {
                unsigned long page = __get_free_page(GFP_KERNEL);
                if (!page)
                        goto sigbus;
                clear_page(page);
                entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
                vma->vm_mm->rss++;
                tsk->min_flt++;
                flush_page_to_ram(page);
        }
        put_page(page_table, entry);
        return;

sigbus:
        force_sig(SIGBUS, current);
        put_page(page_table, BAD_PAGE);
        /* no need to invalidate, wasn't present */
        return;

swap_page:
        do_swap_page(tsk, vma, address, page_table, entry, write_access);
        return;
}
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures). The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 */
static inline void handle_pte_fault(struct task_struct *tsk,
        struct vm_area_struct * vma, unsigned long address,
        int write_access, pte_t * pte)
{
        pte_t entry = *pte;

        if (!pte_present(entry)) {
                do_no_page(tsk, vma, address, write_access, pte, entry);
                return;
        }
        entry = pte_mkyoung(entry);
        set_pte(pte, entry);
        flush_tlb_page(vma, address);
        if (!write_access)
                return;
        if (pte_write(entry)) {
                entry = pte_mkdirty(entry);
                set_pte(pte, entry);
                flush_tlb_page(vma, address);
                return;
        }
        do_wp_page(tsk, vma, address, pte);
}
/*
 * By the time we get here, we already hold the mm semaphore
 */
void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
        unsigned long address, int write_access)
{
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;

        pgd = pgd_offset(vma->vm_mm, address);
        pmd = pmd_alloc(pgd, address);
        if (!pmd)
                goto no_memory;
        pte = pte_alloc(pmd, address);
        if (!pte)
                goto no_memory;
        lock_kernel();
        handle_pte_fault(tsk, vma, address, write_access, pte);
        unlock_kernel();
        update_mmu_cache(vma, address, *pte);
        return;

no_memory:
        oom(tsk);
}
/*
 * Simplistic page force-in..
 */
void make_pages_present(unsigned long addr, unsigned long end)
{
        int write;
        struct vm_area_struct * vma;

        vma = find_vma(current->mm, addr);
        write = (vma->vm_flags & VM_WRITE) != 0;
        while (addr < end) {
                handle_mm_fault(current, vma, addr, write);
                addr += PAGE_SIZE;
        }
}