/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 *
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 *
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *              Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 *
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 */
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;
/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
        if (from == ZERO_PAGE) {
                clear_page(to);
                return;
        }
        copy_page(to, from);
}
mem_map_t * mem_map = NULL;
/*
 * oom() prints a message (so that the user knows why the process died),
 * and gives the process an untrappable SIGKILL.
 */
void oom(struct task_struct * task)
{
        printk("\nOut of memory for %s.\n", task->comm);
        force_sig(SIGKILL, task);
}
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
        pte_t * pte;

        if (pmd_none(*dir))
                return;
        if (pmd_bad(*dir)) {
                printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
                pmd_clear(dir);
                return;
        }
        pte = pte_offset(dir, 0);
        pmd_clear(dir);
        pte_free(pte);
}
static inline void free_one_pgd(pgd_t * dir)
{
        int j;
        pmd_t * pmd;

        if (pgd_none(*dir))
                return;
        if (pgd_bad(*dir)) {
                printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
                pgd_clear(dir);
                return;
        }
        pmd = pmd_offset(dir, 0);
        pgd_clear(dir);
        for (j = 0; j < PTRS_PER_PMD ; j++)
                free_one_pmd(pmd+j);
        pmd_free(pmd);
}
/* Low and high watermarks for the page table cache.
   The system should try to have
   pgt_cache_water[0] <= cache elements <= pgt_cache_water[1]
 */
int pgt_cache_water[2] = { 25, 50 };
/* Returns the number of pages freed */
int check_pgt_cache(void)
{
        return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}
/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
        pgd_t * page_dir = mm->pgd;

        if (page_dir && page_dir != swapper_pg_dir) {
                page_dir += first;
                do {
                        free_one_pgd(page_dir);
                        page_dir++;
                } while (--nr);
        }

        /* keep the page table cache within bounds */
        check_pgt_cache();
}
/*
 * This function just frees the page directory - the
 * page tables themselves have been freed earlier by
 * clear_page_tables().
 */
void free_page_tables(struct mm_struct * mm)
{
        pgd_t * page_dir = mm->pgd;

        if (page_dir) {
                if (page_dir == swapper_pg_dir)
                        goto out_bad;
                pgd_free(page_dir);
        }
        return;

out_bad:
        printk(KERN_ERR
                "free_page_tables: Trying to free kernel pgd\n");
        return;
}
int new_page_tables(struct task_struct * tsk)
{
        pgd_t * new_pg;

        if (!(new_pg = pgd_alloc()))
                return -ENOMEM;
        SET_PAGE_DIR(tsk, new_pg);
        tsk->mm->pgd = new_pg;
        return 0;
}
#define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK  ((PTRS_PER_PMD-1) * sizeof(pmd_t))
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma)
{
        pgd_t * src_pgd, * dst_pgd;
        unsigned long address = vma->vm_start;
        unsigned long end = vma->vm_end;
        unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        src_pgd = pgd_offset(src, address)-1;
        dst_pgd = pgd_offset(dst, address)-1;

        for (;;) {
                pmd_t * src_pmd, * dst_pmd;

                src_pgd++; dst_pgd++;

                /* copy_pmd_range */

                if (pgd_none(*src_pgd))
                        goto skip_copy_pmd_range;
                if (pgd_bad(*src_pgd)) {
                        printk("copy_pmd_range: bad pgd (%08lx)\n",
                                pgd_val(*src_pgd));
                        pgd_clear(src_pgd);
skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
                        if (address >= end)
                                goto out;
                        continue;
                }
                if (pgd_none(*dst_pgd)) {
                        if (!pmd_alloc(dst_pgd, 0))
                                goto nomem;
                }

                src_pmd = pmd_offset(src_pgd, address);
                dst_pmd = pmd_offset(dst_pgd, address);

                do {
                        pte_t * src_pte, * dst_pte;

                        /* copy_pte_range */

                        if (pmd_none(*src_pmd))
                                goto skip_copy_pte_range;
                        if (pmd_bad(*src_pmd)) {
                                printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
                                pmd_clear(src_pmd);
skip_copy_pte_range:            address = (address + PMD_SIZE) & PMD_MASK;
                                if (address >= end)
                                        goto out;
                                goto cont_copy_pmd_range;
                        }
                        if (pmd_none(*dst_pmd)) {
                                if (!pte_alloc(dst_pmd, 0))
                                        goto nomem;
                        }

                        src_pte = pte_offset(src_pmd, address);
                        dst_pte = pte_offset(dst_pmd, address);

                        do {
                                pte_t pte = *src_pte;
                                unsigned long page_nr;

                                /* copy_one_pte */

                                if (pte_none(pte))
                                        goto cont_copy_pte_range;
                                if (!pte_present(pte)) {
                                        swap_duplicate(pte_val(pte));
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                page_nr = MAP_NR(pte_page(pte));
                                if (page_nr >= max_mapnr ||
                                    PageReserved(mem_map+page_nr)) {
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                /* If it's a COW mapping, write protect it both in the parent and the child */
                                if (cow) {
                                        pte = pte_wrprotect(pte);
                                        set_pte(src_pte, pte);
                                }
                                /* If it's a shared mapping, mark it clean in the child */
                                if (vma->vm_flags & VM_SHARED)
                                        pte = pte_mkclean(pte);
                                set_pte(dst_pte, pte_mkold(pte));
                                atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:            address += PAGE_SIZE;
                                if (address >= end)
                                        goto out;
                                src_pte++;
                                dst_pte++;
                        } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:    src_pmd++;
                        dst_pmd++;
                } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
        }
out:
        return 0;

nomem:
        return -ENOMEM;
}
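/*
 * Summary of the COW rule above: a mapping is treated as copy-on-write
 * exactly when it is private but potentially writable, i.e. VM_MAYWRITE
 * is set and VM_SHARED is clear (so masking with VM_SHARED|VM_MAYWRITE
 * yields VM_MAYWRITE).  For such a vma both the parent and the child
 * pte are write-protected here, so the first write on either side
 * faults into do_wp_page(), which then makes the private copy.  Shared
 * mappings keep a single writable page; the child's pte is only marked
 * clean and old.
 */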
/*
 * Return indicates whether a page was freed so caller can adjust rss
 */
static inline int free_pte(pte_t page)
{
        if (pte_present(page)) {
                unsigned long addr = pte_page(page);
                if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
                        return 0;
                /*
                 * free_page() used to be able to clear swap cache
                 * entries.  We may now have to do it manually.
                 */
                free_page_and_swap_cache(addr);
                return 1;
        }
        swap_free(pte_val(page));
        return 0;
}
static inline void forget_pte(pte_t page)
{
        if (!pte_none(page)) {
                printk("forget_pte: old mapping existed!\n");
                free_pte(page);
        }
}
static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
        pte_t * pte;
        int freed;

        if (pmd_none(*pmd))
                return 0;
        if (pmd_bad(*pmd)) {
                printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
                pmd_clear(pmd);
                return 0;
        }
        pte = pte_offset(pmd, address);
        address &= ~PMD_MASK;
        if (address + size > PMD_SIZE)
                size = PMD_SIZE - address;
        size >>= PAGE_SHIFT;
        freed = 0;
        for (;;) {
                pte_t page;
                if (!size)
                        break;
                page = *pte;
                pte++;
                size--;
                if (pte_none(page))
                        continue;
                pte_clear(pte-1);
                freed += free_pte(page);
        }
        return freed;
}
static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
{
        pmd_t * pmd;
        unsigned long end;
        int freed;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }
        pmd = pmd_offset(dir, address);
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        freed = 0;
        do {
                freed += zap_pte_range(pmd, address, end - address);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return freed;
}
/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
        pgd_t * dir;
        unsigned long end = address + size;
        int freed = 0;

        dir = pgd_offset(mm, address);
        while (address < end) {
                freed += zap_pmd_range(dir, address, end - address);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        /*
         * Update rss for the mm_struct (not necessarily current->mm)
         */
        if (mm->rss > 0) {
                mm->rss -= freed;
                if (mm->rss < 0)
                        mm->rss = 0;
        }
}
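/*
 * Note on the "freed" accounting above: free_pte() only reports pages
 * that were actually released, so reserved pages and pages outside
 * mem_map never disturb the rss count, and the count is clamped at
 * zero in case the bookkeeping has drifted.  Callers are expected to
 * bracket this with cache/TLB flushes themselves; vmtruncate() below
 * shows the typical flush_cache_range() / zap_page_range() /
 * flush_tlb_range() sequence.
 */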
static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                pte_t oldpage = *pte;
                set_pte(pte, zero_pte);
                forget_pte(oldpage);
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        do {
                pte_t * pte = pte_alloc(pmd, address);
                if (!pte)
                        return -ENOMEM;
                zeromap_pte_range(pte, address, end - address, zero_pte);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
        int error = 0;
        pgd_t * dir;
        unsigned long beg = address;
        unsigned long end = address + size;
        pte_t zero_pte;

        zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
        dir = pgd_offset(current->mm, address);
        flush_cache_range(current->mm, beg, end);
        while (address < end) {
                pmd_t *pmd = pmd_alloc(dir, address);
                error = -ENOMEM;
                if (!pmd)
                        break;
                error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
                if (error)
                        break;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(current->mm, beg, end);
        return error;
}
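/*
 * Illustrative sketch (not part of this file): a 2.2-era driver that
 * wants to hand out pre-zeroed, read-mostly memory could back its
 * mmap() method with zeromap_page_range(); every pte in the range then
 * points at ZERO_PAGE, write-protected.  The "exdev" name below is
 * made up.
 *
 *      static int exdev_mmap(struct file *file, struct vm_area_struct *vma)
 *      {
 *              if (zeromap_page_range(vma->vm_start,
 *                                     vma->vm_end - vma->vm_start,
 *                                     vma->vm_page_prot))
 *                      return -EAGAIN;
 *              return 0;
 *      }
 */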
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access")
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
        unsigned long phys_addr, pgprot_t prot)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                unsigned long mapnr;
                pte_t oldpage = *pte;
                pte_clear(pte);

                mapnr = MAP_NR(__va(phys_addr));
                if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
                        set_pte(pte, mk_pte_phys(phys_addr, prot));
                forget_pte(oldpage);
                address += PAGE_SIZE;
                phys_addr += PAGE_SIZE;
                pte++;
        } while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
        unsigned long phys_addr, pgprot_t prot)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        phys_addr -= address;
        do {
                pte_t * pte = pte_alloc(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_pte_range(pte, address, end - address, address + phys_addr, prot);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
        int error = 0;
        pgd_t * dir;
        unsigned long beg = from;
        unsigned long end = from + size;

        phys_addr -= from;
        dir = pgd_offset(current->mm, from);
        flush_cache_range(current->mm, beg, end);
        while (from < end) {
                pmd_t *pmd = pmd_alloc(dir, from);
                error = -ENOMEM;
                if (!pmd)
                        break;
                error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
                if (error)
                        break;
                from = (from + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(current->mm, beg, end);
        return error;
}
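/*
 * Illustrative sketch (not part of this file): the usual consumer of
 * remap_page_range() in this era is a device driver's mmap() method,
 * mapping a physical region into the caller's address space.  The
 * "exdev" names and EXDEV_PHYS_BASE below are made up.
 *
 *      static int exdev_mmap(struct file *file, struct vm_area_struct *vma)
 *      {
 *              unsigned long size = vma->vm_end - vma->vm_start;
 *
 *              if (remap_page_range(vma->vm_start, EXDEV_PHYS_BASE,
 *                                   size, vma->vm_page_prot))
 *                      return -EAGAIN;
 *              return 0;
 *      }
 */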
/*
 * sanity-check function..
 */
static void put_page(pte_t * page_table, pte_t pte)
{
        if (!pte_none(*page_table)) {
                free_page_and_swap_cache(pte_page(pte));
                return;
        }
        /* no need for flush_tlb */
        set_pte(page_table, pte);
}
/*
 * This routine is used to map in a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
        pgd_t * pgd;
        pmd_t * pmd;
        pte_t * pte;

        if (MAP_NR(page) >= max_mapnr)
                printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
        if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
                printk("mem_map disagrees with %08lx at %08lx\n",page,address);
        pgd = pgd_offset(tsk->mm,address);
        pmd = pmd_alloc(pgd, address);
        if (!pmd) {
                free_page(page);
                oom(tsk);
                return 0;
        }
        pte = pte_alloc(pmd, address);
        if (!pte) {
                free_page(page);
                oom(tsk);
                return 0;
        }
        if (!pte_none(*pte)) {
                printk("put_dirty_page: pte %08lx already exists\n",
                       pte_val(*pte));
                free_page(page);
                return 0;
        }
        flush_page_to_ram(page);
        set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
        /* no need for flush_tlb */
        return page;
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here is that it results
 * in better assembly code.. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 */
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, pte_t *page_table)
{
        pte_t pte;
        unsigned long old_page, new_page;
        struct page * page_map;

        pte = *page_table;
        new_page = __get_free_page(GFP_USER);
        /* Did someone else copy this page for us while we slept? */
        if (pte_val(*page_table) != pte_val(pte))
                goto end_wp_page;
        if (!pte_present(pte))
                goto end_wp_page;
        if (pte_write(pte))
                goto end_wp_page;
        old_page = pte_page(pte);
        if (MAP_NR(old_page) >= max_mapnr)
                goto bad_wp_page;
        tsk->min_flt++;
        page_map = mem_map + MAP_NR(old_page);

        /*
         * We can avoid the copy if:
         * - we're the only user (count == 1)
         * - the only other user is the swap cache,
         *   and the only swap cache user is itself,
         *   in which case we can remove the page
         *   from the swap cache.
         */
        switch (atomic_read(&page_map->count)) {
        case 2:
                if (!PageSwapCache(page_map))
                        break;
                if (swap_count(page_map->offset) != 1)
                        break;
                delete_from_swap_cache(page_map);
                /* FallThrough */
        case 1:
                /* We can release the kernel lock now.. */
                unlock_kernel();

                flush_cache_page(vma, address);
                set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
                flush_tlb_page(vma, address);
end_wp_page:
                if (new_page)
                        free_page(new_page);
                return 1;
        }

        unlock_kernel();
        if (!new_page)
                return 0;

        if (PageReserved(mem_map + MAP_NR(old_page)))
                ++vma->vm_mm->rss;
        copy_cow_page(old_page,new_page);
        flush_page_to_ram(old_page);
        flush_page_to_ram(new_page);
        flush_cache_page(vma, address);
        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
        free_page(old_page);
        flush_tlb_page(vma, address);
        return 1;

bad_wp_page:
        printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
        send_sig(SIGKILL, tsk, 1);
        if (new_page)
                free_page(new_page);
        return 0;
}
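/*
 * The two possible outcomes of do_wp_page(), as implemented above: if
 * the page has a single user (or its only other user was a swap cache
 * entry we could drop), the existing page is simply made writable and
 * dirty in place; otherwise copy_cow_page() copies it into the freshly
 * allocated new_page, the pte is pointed at the copy, and the old
 * page's reference is released with free_page().
 */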
/*
 * This function zeroes out partial mmap'ed pages at truncation time..
 */
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
        pgd_t *page_dir;
        pmd_t *page_middle;
        pte_t *page_table, pte;

        page_dir = pgd_offset(vma->vm_mm, address);
        if (pgd_none(*page_dir))
                return;
        if (pgd_bad(*page_dir)) {
                printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
                pgd_clear(page_dir);
                return;
        }
        page_middle = pmd_offset(page_dir, address);
        if (pmd_none(*page_middle))
                return;
        if (pmd_bad(*page_middle)) {
                printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
                pmd_clear(page_middle);
                return;
        }
        page_table = pte_offset(page_middle, address);
        pte = *page_table;
        if (!pte_present(pte))
                return;
        flush_cache_page(vma, address);
        address &= ~PAGE_MASK;
        address += pte_page(pte);
        if (MAP_NR(address) >= max_mapnr)
                return;
        memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
        flush_page_to_ram(pte_page(pte));
}
/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page.  Ugly, but necessary.
 */
void vmtruncate(struct inode * inode, unsigned long offset)
{
        struct vm_area_struct * mpnt;

        truncate_inode_pages(inode, offset);
        if (!inode->i_mmap)
                return;
        mpnt = inode->i_mmap;
        do {
                struct mm_struct *mm = mpnt->vm_mm;
                unsigned long start = mpnt->vm_start;
                unsigned long end = mpnt->vm_end;
                unsigned long len = end - start;
                unsigned long diff;

                /* mapping wholly truncated? */
                if (mpnt->vm_offset >= offset) {
                        flush_cache_range(mm, start, end);
                        zap_page_range(mm, start, len);
                        flush_tlb_range(mm, start, end);
                        continue;
                }
                /* mapping wholly unaffected? */
                diff = offset - mpnt->vm_offset;
                if (diff >= len)
                        continue;
                /* Ok, partially affected.. */
                start += diff;
                len = (len - diff) & PAGE_MASK;
                if (start & ~PAGE_MASK) {
                        partial_clear(mpnt, start);
                        start = (start + ~PAGE_MASK) & PAGE_MASK;
                }
                flush_cache_range(mm, start, end);
                zap_page_range(mm, start, len);
                flush_tlb_range(mm, start, end);
        } while ((mpnt = mpnt->vm_next_share) != NULL);
}
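/*
 * Worked example of the partial-page case above (assuming a 4096-byte
 * PAGE_SIZE): truncating a file that is mapped with vm_offset 0 down
 * to offset 0x1800 leaves "start" at vm_start + 0x1800, which is not
 * page aligned, so partial_clear() zeroes bytes 0x800..0xfff of that
 * page in place, "start" is rounded up to the next page boundary, and
 * zap_page_range() then drops the remaining whole pages of the mapping.
 */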
/*
 * This is called with the kernel lock held, we need
 * to return without it.
 */
static int do_swap_page(struct task_struct * tsk,
        struct vm_area_struct * vma, unsigned long address,
        pte_t * page_table, pte_t entry, int write_access)
{
        if (!vma->vm_ops || !vma->vm_ops->swapin) {
                swap_in(tsk, vma, page_table, pte_val(entry), write_access);
                flush_page_to_ram(pte_page(*page_table));
        } else {
                pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
                if (pte_val(*page_table) != pte_val(entry)) {
                        free_page(pte_page(page));
                } else {
                        if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
                            !(vma->vm_flags & VM_SHARED))
                                page = pte_wrprotect(page);
                        ++vma->vm_mm->rss;
                        ++tsk->maj_flt;
                        flush_page_to_ram(pte_page(page));
                        set_pte(page_table, page);
                }
        }
        unlock_kernel();
        return 1;
}
/*
 * This only needs the MM semaphore
 */
static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access)
{
        pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
        if (write_access) {
                unsigned long page = __get_free_page(GFP_USER);
                if (!page)
                        return 0;
                clear_page(page);
                entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
                vma->vm_mm->rss++;
                tsk->min_flt++;
                flush_page_to_ram(page);
        }
        put_page(page_table, entry);
        return 1;
}
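/*
 * Note on the anonymous case above: a read fault costs no memory at
 * all - the pte is simply pointed at the global ZERO_PAGE, write
 * protected.  If the task later writes to it, the fault is handled by
 * do_wp_page(), whose copy_cow_page() recognizes ZERO_PAGE and clears
 * the new page instead of copying it.
 */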
/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * This is called with the MM semaphore and the kernel lock held.
 * We need to release the kernel lock as soon as possible..
 */
static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, int write_access, pte_t *page_table)
{
        unsigned long page;
        pte_t entry;

        if (!vma->vm_ops || !vma->vm_ops->nopage) {
                unlock_kernel();
                return do_anonymous_page(tsk, vma, page_table, write_access);
        }

        /*
         * The third argument is "no_share", which tells the low-level code
         * to copy, not share the page even if sharing is possible.  It's
         * essentially an early COW detection.
         */
        page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
                (vma->vm_flags & VM_SHARED)?0:write_access);

        unlock_kernel();
        if (!page)
                return 0;

        ++tsk->maj_flt;
        ++vma->vm_mm->rss;
        /*
         * This silly early PAGE_DIRTY setting removes a race
         * due to the bad i386 page protection. But it's valid
         * for other architectures too.
         *
         * Note that if write_access is true, we either now have
         * an exclusive copy of the page, or this is a shared mapping,
         * so we can make it writable and dirty to avoid having to
         * handle that later.
         */
        flush_page_to_ram(page);
        entry = mk_pte(page, vma->vm_page_prot);
        if (write_access) {
                entry = pte_mkwrite(pte_mkdirty(entry));
        } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
                   !(vma->vm_flags & VM_SHARED))
                entry = pte_wrprotect(entry);
        put_page(page_table, entry);
        /* no need to invalidate: a not-present page shouldn't be cached */
        return 1;
}
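/*
 * Illustrative sketch (not part of this file) of the ->nopage()
 * contract used above: the method returns the kernel address of a page
 * already holding a reference for the new mapping, or 0 on failure;
 * when "no_share" is non-zero it must hand back a private copy rather
 * than a shared page.  The "exdev" names below are made up, and
 * exdev_fill_page() is a hypothetical helper.
 *
 *      static unsigned long exdev_nopage(struct vm_area_struct *vma,
 *                                        unsigned long address, int no_share)
 *      {
 *              unsigned long page = __get_free_page(GFP_USER);
 *
 *              if (!page)
 *                      return 0;
 *              exdev_fill_page(vma, address, page);
 *              return page;
 *      }
 */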
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 */
static inline int handle_pte_fault(struct task_struct *tsk,
        struct vm_area_struct * vma, unsigned long address,
        int write_access, pte_t * pte)
{
        pte_t entry;

        lock_kernel();
        entry = *pte;

        if (!pte_present(entry)) {
                if (pte_none(entry))
                        return do_no_page(tsk, vma, address, write_access, pte);
                return do_swap_page(tsk, vma, address, pte, entry, write_access);
        }

        entry = pte_mkyoung(entry);
        set_pte(pte, entry);
        flush_tlb_page(vma, address);
        if (write_access) {
                if (!pte_write(entry))
                        return do_wp_page(tsk, vma, address, pte);

                entry = pte_mkdirty(entry);
                set_pte(pte, entry);
                flush_tlb_page(vma, address);
        }
        unlock_kernel();
        return 1;
}
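/*
 * Dispatch summary for handle_pte_fault(): a pte that is not present
 * and empty goes to do_no_page() (anonymous memory or a ->nopage()
 * mapping); not present but non-empty means a swap entry, handled by
 * do_swap_page(); a present pte reaches here either to emulate the
 * accessed/dirty bits in software or, for a write to a write-protected
 * pte, to break COW via do_wp_page().  The helpers are expected to
 * return with the kernel lock released (see the comments above
 * do_swap_page() and do_no_page()).
 */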
/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
        unsigned long address, int write_access)
{
        pgd_t *pgd;
        pmd_t *pmd;

        pgd = pgd_offset(vma->vm_mm, address);
        pmd = pmd_alloc(pgd, address);
        if (pmd) {
                pte_t * pte = pte_alloc(pmd, address);
                if (pte) {
                        if (handle_pte_fault(tsk, vma, address, write_access, pte)) {
                                update_mmu_cache(vma, address, *pte);
                                return 1;
                        }
                }
        }
        return 0;
}
/*
 * Simplistic page force-in..
 */
void make_pages_present(unsigned long addr, unsigned long end)
{
        int write;
        struct vm_area_struct * vma;

        vma = find_vma(current->mm, addr);
        write = (vma->vm_flags & VM_WRITE) != 0;
        while (addr < end) {
                handle_mm_fault(current, vma, addr, write);
                addr += PAGE_SIZE;
        }
}
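/*
 * make_pages_present() simply forces a fault on every page in the
 * range so that each one gets mapped in (as a write fault if the vma
 * is writable, so private pages are also COW-broken up front).
 * Callers such as the mlock() path presumably use it to pre-populate
 * a pinned region; those callers live outside this file and are not
 * verified here.
 */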