mm/memory.c from davej-history.git (Import 2.3.11pre5)
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 */
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/swapctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
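
/*
 * These globals are set up during boot-time memory initialization:
 * max_mapnr is the number of valid mem_map entries, num_physpages the
 * number of physical page frames, and high_memory the first virtual
 * address beyond the directly mapped physical memory.
 */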
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;
/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
	if (from == ZERO_PAGE(to)) {
		clear_page(to);
		return;
	}
	copy_page(to, from);
}
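
/*
 * mem_map is the array of struct page descriptors, one entry per
 * physical page frame; MAP_NR() converts a kernel virtual address
 * into an index into this array.
 */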
mem_map_t * mem_map = NULL;
/*
 * oom() prints a message (so that the user knows why the process died),
 * and gives the process an untrappable SIGKILL.
 */
void oom(struct task_struct * task)
{
	printk("\nOut of memory for %s.\n", task->comm);
	force_sig(SIGKILL, task);
}
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
	pte_t * pte;

	if (pmd_none(*dir))
		return;
	if (pmd_bad(*dir)) {
		printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
		pmd_clear(dir);
		return;
	}
	pte = pte_offset(dir, 0);
	pmd_clear(dir);
	pte_free(pte);
}
static inline void free_one_pgd(pgd_t * dir)
{
	int j;
	pmd_t * pmd;

	if (pgd_none(*dir))
		return;
	if (pgd_bad(*dir)) {
		printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
		pgd_clear(dir);
		return;
	}
	pmd = pmd_offset(dir, 0);
	pgd_clear(dir);
	for (j = 0; j < PTRS_PER_PMD ; j++)
		free_one_pmd(pmd+j);
	pmd_free(pmd);
}
/* Low and high watermarks for the page table cache.
   The system should try to have
   pgt_cache_water[0] <= cache elements <= pgt_cache_water[1]
 */
int pgt_cache_water[2] = { 25, 50 };

/* Returns the number of pages freed */
int check_pgt_cache(void)
{
	return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}
/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
	pgd_t * page_dir = mm->pgd;

	page_dir += first;
	do {
		free_one_pgd(page_dir);
		page_dir++;
	} while (--nr);

	/* keep the page table cache within bounds */
	check_pgt_cache();
}
/*
 * This function just frees the page directory - the page
 * tables themselves have been freed earlier by
 * clear_page_tables().
 */
void free_page_tables(struct mm_struct * mm)
{
	pgd_t * page_dir = mm->pgd;

	if (page_dir) {
		if (page_dir == swapper_pg_dir)
			goto out_bad;
		pgd_free(page_dir);
	}
	return;

out_bad:
	printk(KERN_ERR
		"free_page_tables: Trying to free kernel pgd\n");
	return;
}
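
/*
 * Allocate a fresh page directory for a task and install it both in
 * the hardware-visible place (SET_PAGE_DIR) and in tsk->mm->pgd.
 * Returns 0 on success, -ENOMEM if the pgd could not be allocated.
 */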
int new_page_tables(struct task_struct * tsk)
{
	pgd_t * new_pg;

	if (!(new_pg = pgd_alloc()))
		return -ENOMEM;
	SET_PAGE_DIR(tsk, new_pg);
	tsk->mm->pgd = new_pg;
	return 0;
}
#define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
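
/*
 * These masks cover the offset-within-table bits of a pte/pmd pointer.
 * copy_page_range() below uses them to detect when the source pointer
 * wraps past the end of a pte or pmd table, i.e. when
 * (unsigned long)ptr & MASK becomes zero again.
 */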
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
			struct vm_area_struct *vma)
{
	pgd_t * src_pgd, * dst_pgd;
	unsigned long address = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	src_pgd = pgd_offset(src, address)-1;
	dst_pgd = pgd_offset(dst, address)-1;

	for (;;) {
		pmd_t * src_pmd, * dst_pmd;

		src_pgd++; dst_pgd++;

		/* copy_pmd_range */

		if (pgd_none(*src_pgd))
			goto skip_copy_pmd_range;
		if (pgd_bad(*src_pgd)) {
			printk("copy_pmd_range: bad pgd (%08lx)\n",
				pgd_val(*src_pgd));
			pgd_clear(src_pgd);
skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
			if (address >= end)
				goto out;
			continue;
		}
		if (pgd_none(*dst_pgd)) {
			if (!pmd_alloc(dst_pgd, 0))
				goto nomem;
		}

		src_pmd = pmd_offset(src_pgd, address);
		dst_pmd = pmd_offset(dst_pgd, address);

		do {
			pte_t * src_pte, * dst_pte;

			/* copy_pte_range */

			if (pmd_none(*src_pmd))
				goto skip_copy_pte_range;
			if (pmd_bad(*src_pmd)) {
				printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
				pmd_clear(src_pmd);
skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
				if (address >= end)
					goto out;
				goto cont_copy_pmd_range;
			}
			if (pmd_none(*dst_pmd)) {
				if (!pte_alloc(dst_pmd, 0))
					goto nomem;
			}

			src_pte = pte_offset(src_pmd, address);
			dst_pte = pte_offset(dst_pmd, address);

			do {
				pte_t pte = *src_pte;
				unsigned long page_nr;

				/* copy_one_pte */

				if (pte_none(pte))
					goto cont_copy_pte_range;
				if (!pte_present(pte)) {
					swap_duplicate(pte_val(pte));
					set_pte(dst_pte, pte);
					goto cont_copy_pte_range;
				}
				page_nr = MAP_NR(pte_page(pte));
				if (page_nr >= max_mapnr ||
				    PageReserved(mem_map+page_nr)) {
					set_pte(dst_pte, pte);
					goto cont_copy_pte_range;
				}
				/* If it's a COW mapping, write protect it both in the parent and the child */
				if (cow) {
					pte = pte_wrprotect(pte);
					set_pte(src_pte, pte);
				}
				/* If it's a shared mapping, mark it clean in the child */
				if (vma->vm_flags & VM_SHARED)
					pte = pte_mkclean(pte);
				set_pte(dst_pte, pte_mkold(pte));
				get_page(mem_map + page_nr);

cont_copy_pte_range:		address += PAGE_SIZE;
				if (address >= end)
					goto out;
				src_pte++;
				dst_pte++;
			} while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:	src_pmd++;
			dst_pmd++;
		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
	}
out:
	return 0;

nomem:
	return -ENOMEM;
}
/*
 * Return indicates whether a page was freed so caller can adjust rss
 */
static inline int free_pte(pte_t page)
{
	if (pte_present(page)) {
		unsigned long addr = pte_page(page);
		if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
			return 0;
		/*
		 * free_page() used to be able to clear swap cache
		 * entries. We may now have to do it manually.
		 */
		free_page_and_swap_cache(addr);
		return 1;
	}
	swap_free(pte_val(page));
	return 0;
}
static inline void forget_pte(pte_t page)
{
	if (!pte_none(page)) {
		printk("forget_pte: old mapping existed!\n");
		free_pte(page);
	}
}
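
/*
 * Clear and free all ptes in the given range within a single pmd.
 * Returns the number of present, non-reserved pages actually freed,
 * so that the caller can adjust the mm's rss accordingly.
 */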
static inline int zap_pte_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size)
{
	pte_t * pte;
	int freed;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	address &= ~PMD_MASK;
	if (address + size > PMD_SIZE)
		size = PMD_SIZE - address;
	size >>= PAGE_SHIFT;
	freed = 0;
	for (;;) {
		pte_t page;
		if (!size)
			break;
		page = *pte;
		pte++;
		size--;
		pte_clear(pte-1);
		if (pte_none(page))
			continue;
		freed += free_pte(page);
	}
	return freed;
}
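
/*
 * Walk the pmds covering [address, address+size) below one pgd entry
 * and let zap_pte_range() do the per-pte work. Also returns the number
 * of pages freed.
 */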
static inline int zap_pmd_range(struct mm_struct *mm, pgd_t * dir, unsigned long address, unsigned long size)
{
	pmd_t * pmd;
	unsigned long end;
	int freed;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}
	pmd = pmd_offset(dir, address);
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	freed = 0;
	do {
		freed += zap_pte_range(mm, pmd, address, end - address);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return freed;
}
/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int freed = 0;

	dir = pgd_offset(mm, address);

	/*
	 * This is a long-lived spinlock. That's fine.
	 * There's no contention, because the page table
	 * lock only protects against kswapd anyway, and
	 * even if kswapd happened to be looking at this
	 * process we _want_ it to get stuck.
	 */
	spin_lock(&mm->page_table_lock);
	while (address < end) {
		freed += zap_pmd_range(mm, dir, address, end - address);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	spin_unlock(&mm->page_table_lock);
	/*
	 * Update rss for the mm_struct (not necessarily current->mm)
	 */
	if (mm->rss > 0) {
		mm->rss -= freed;
		if (mm->rss < 0)
			mm->rss = 0;
	}
}
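
/*
 * The zeromap_*() helpers below map an address range to the write-protected
 * ZERO_PAGE rather than to freshly allocated memory; any previously present
 * mapping in the range is reported and dropped via forget_pte().
 */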
static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
                                     unsigned long size, pgprot_t prot)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
		pte_t oldpage = *pte;
		set_pte(pte, zero_pte);
		forget_pte(oldpage);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address,
                                    unsigned long size, pgprot_t prot)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	do {
		pte_t * pte = pte_alloc(pmd, address);
		if (!pte)
			return -ENOMEM;
		zeromap_pte_range(pte, address, end - address, prot);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = address;
	unsigned long end = address + size;

	dir = pgd_offset(current->mm, address);
	flush_cache_range(current->mm, beg, end);
	while (address < end) {
		pmd_t *pmd = pmd_alloc(dir, address);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = zeromap_pmd_range(pmd, address, end - address, prot);
		if (error)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(current->mm, beg, end);
	return error;
}
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access")
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		unsigned long mapnr;
		pte_t oldpage = *pte;
		pte_clear(pte);

		mapnr = MAP_NR(__va(phys_addr));
		if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
			set_pte(pte, mk_pte_phys(phys_addr, prot));
		forget_pte(oldpage);
		address += PAGE_SIZE;
		phys_addr += PAGE_SIZE;
		pte++;
	} while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	phys_addr -= address;
	do {
		pte_t * pte = pte_alloc(pmd, address);
		if (!pte)
			return -ENOMEM;
		remap_pte_range(pte, address, end - address, address + phys_addr, prot);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
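
/*
 * Map the physical range [phys_addr, phys_addr+size) at user virtual
 * address 'from' in the current process. Typically called from a driver's
 * mmap() implementation; note that remap_pte_range() above only installs
 * ptes for reserved or out-of-range page frames, normal RAM is skipped.
 */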
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = from;
	unsigned long end = from + size;

	phys_addr -= from;
	dir = pgd_offset(current->mm, from);
	flush_cache_range(current->mm, beg, end);
	while (from < end) {
		pmd_t *pmd = pmd_alloc(dir, from);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
		if (error)
			break;
		from = (from + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(current->mm, beg, end);
	return error;
}
/*
 * This routine is used to map a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
	pgd_t * pgd;
	pmd_t * pmd;
	pte_t * pte;

	if (MAP_NR(page) >= max_mapnr)
		printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
	if (page_count(mem_map + MAP_NR(page)) != 1)
		printk("mem_map disagrees with %08lx at %08lx\n",page,address);
	pgd = pgd_offset(tsk->mm,address);
	pmd = pmd_alloc(pgd, address);
	if (!pmd) {
		free_page(page);
		oom(tsk);
		return 0;
	}
	pte = pte_alloc(pmd, address);
	if (!pte) {
		free_page(page);
		oom(tsk);
		return 0;
	}
	if (!pte_none(*pte)) {
		printk("put_dirty_page: pte %08lx already exists\n",
		       pte_val(*pte));
		free_page(page);
		return 0;
	}
	flush_page_to_ram(page);
	set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
	/* no need for flush_tlb */
	return page;
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here is that it results
 * in better assembly code.. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with the page table read-lock held, and need to exit without
 * it.
 */
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, pte_t *page_table, pte_t pte)
{
	unsigned long old_page, new_page;
	struct page * page;

	old_page = pte_page(pte);
	if (MAP_NR(old_page) >= max_mapnr)
		goto bad_wp_page;
	tsk->min_flt++;
	page = mem_map + MAP_NR(old_page);

	/*
	 * We can avoid the copy if:
	 * - we're the only user (count == 1)
	 * - the only other user is the swap cache,
	 *   and the only swap cache user is itself,
	 *   in which case we can remove the page
	 *   from the swap cache.
	 */
	switch (page_count(page)) {
	case 2:
		if (!PageSwapCache(page))
			break;
		if (swap_count(page->offset) != 1)
			break;
		delete_from_swap_cache(page);
		/* FallThrough */
	case 1:
		flush_cache_page(vma, address);
		set_pte(page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
		flush_tlb_page(vma, address);
		spin_unlock(&tsk->mm->page_table_lock);
		return 1;
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	spin_unlock(&tsk->mm->page_table_lock);
	new_page = __get_free_page(GFP_USER);
	if (!new_page)
		return -1;
	spin_lock(&tsk->mm->page_table_lock);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	if (pte_val(*page_table) == pte_val(pte)) {
		if (PageReserved(page))
			++vma->vm_mm->rss;
		copy_cow_page(old_page,new_page);
		flush_page_to_ram(old_page);
		flush_page_to_ram(new_page);
		flush_cache_page(vma, address);
		set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
		flush_tlb_page(vma, address);

		/* Free the old page.. */
		new_page = old_page;
	}
	spin_unlock(&tsk->mm->page_table_lock);
	free_page(new_page);
	return 1;

bad_wp_page:
	printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
	return -1;
}
/*
 * This function zeroes out partial mmap'ed pages at truncation time..
 */
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
	pgd_t *page_dir;
	pmd_t *page_middle;
	pte_t *page_table, pte;

	page_dir = pgd_offset(vma->vm_mm, address);
	if (pgd_none(*page_dir))
		return;
	if (pgd_bad(*page_dir)) {
		printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
		pgd_clear(page_dir);
		return;
	}
	page_middle = pmd_offset(page_dir, address);
	if (pmd_none(*page_middle))
		return;
	if (pmd_bad(*page_middle)) {
		printk("bad page middle entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
		pmd_clear(page_middle);
		return;
	}
	page_table = pte_offset(page_middle, address);
	pte = *page_table;
	if (!pte_present(pte))
		return;
	flush_cache_page(vma, address);
	address &= ~PAGE_MASK;
	address += pte_page(pte);
	if (MAP_NR(address) >= max_mapnr)
		return;
	memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
	flush_page_to_ram(pte_page(pte));
}
/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page. Ugly, but necessary.
 */
void vmtruncate(struct inode * inode, unsigned long offset)
{
	struct vm_area_struct * mpnt;

	truncate_inode_pages(inode, offset);
	spin_lock(&inode->i_shared_lock);
	if (!inode->i_mmap)
		goto out_unlock;
	mpnt = inode->i_mmap;
	do {
		struct mm_struct *mm = mpnt->vm_mm;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long len = end - start;
		unsigned long diff;

		/* mapping wholly truncated? */
		if (mpnt->vm_offset >= offset) {
			flush_cache_range(mm, start, end);
			zap_page_range(mm, start, len);
			flush_tlb_range(mm, start, end);
			continue;
		}
		/* mapping wholly unaffected? */
		diff = offset - mpnt->vm_offset;
		if (diff >= len)
			continue;
		/* Ok, partially affected.. */
		start += diff;
		len = (len - diff) & PAGE_MASK;
		if (start & ~PAGE_MASK) {
			partial_clear(mpnt, start);
			start = (start + ~PAGE_MASK) & PAGE_MASK;
		}
		flush_cache_range(mm, start, end);
		zap_page_range(mm, start, len);
		flush_tlb_range(mm, start, end);
	} while ((mpnt = mpnt->vm_next_share) != NULL);
out_unlock:
	spin_unlock(&inode->i_shared_lock);
}
/*
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 */
void swapin_readahead(unsigned long entry)
{
	int i;
	struct page *new_page;
	unsigned long offset = SWP_OFFSET(entry);
	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;

	offset = (offset >> page_cluster) << page_cluster;

	i = 1 << page_cluster;
	do {
		/* Don't read-ahead past the end of the swap area */
		if (offset >= swapdev->max)
			break;
		/* Don't block on I/O for read-ahead */
		if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
			break;
		/* Don't read in bad or busy pages */
		if (!swapdev->swap_map[offset])
			break;
		if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
			break;

		/* Ok, do the async read-ahead now */
		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
		if (new_page != NULL)
			__free_page(new_page);
		offset++;
	} while (--i);
	return;
}
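
/*
 * Fault in a page that was swapped out: look it up in (or read it into)
 * the swap cache, account the minor fault, drop the swap reference and
 * install the new pte. If this is a write fault and nobody else shares
 * the page, it is taken out of the swap cache and mapped writable and
 * dirty right away, saving a later COW fault.
 */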
static int do_swap_page(struct task_struct * tsk,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, unsigned long entry, int write_access)
{
	struct page *page = lookup_swap_cache(entry);
	pte_t pte;

	if (!page) {
		lock_kernel();
		swapin_readahead(entry);
		page = read_swap_cache(entry);
		unlock_kernel();
		if (!page)
			return -1;

		flush_page_to_ram(page_address(page));
	}

	vma->vm_mm->rss++;
	tsk->min_flt++;
	swap_free(entry);

	pte = mk_pte(page_address(page), vma->vm_page_prot);

	if (write_access && !is_page_shared(page)) {
		delete_from_swap_cache(page);
		pte = pte_mkwrite(pte_mkdirty(pte));
	}
	set_pte(page_table, pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	return 1;
}
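
/*
 * Anonymous fault (no vm_ops->nopage): a read fault just maps the global
 * zero page, write-protected; a write fault allocates a fresh zeroed page
 * and maps it writable and dirty.
 */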
/*
 * This only needs the MM semaphore
 */
static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
	pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
	if (write_access) {
		unsigned long page = __get_free_page(GFP_USER);
		if (!page)
			return -1;
		clear_page(page);
		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
		vma->vm_mm->rss++;
		tsk->min_flt++;
		flush_page_to_ram(page);
	}
	set_pte(page_table, entry);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, entry);
	return 1;
}
/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * This is called with the MM semaphore and the kernel lock held.
 * We need to release the kernel lock as soon as possible..
 */
static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	unsigned long page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(tsk, vma, page_table, write_access, address);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible. It's
	 * essentially an early COW detection.
	 */
	page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
	if (!page)
		return 0;	/* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */
	if (page == -1)
		return -1;	/* OOM */

	++tsk->maj_flt;
	++vma->vm_mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (page_count(mem_map+MAP_NR(page)) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		entry = pte_wrprotect(entry);
	set_pte(page_table, entry);
	/* no need to invalidate: a not-present page shouldn't be cached */
	update_mmu_cache(vma, address, entry);
	return 1;
}
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures). The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * Note the "page_table_lock". It is to protect against kswapd removing
 * pages from under us. Note that kswapd only ever _removes_ pages, never
 * adds them. As such, once we have noticed that the page is not present,
 * we can drop the lock early.
 *
 * The adding of pages is protected by the MM semaphore (which we hold),
 * so we don't need to worry about a page suddenly being added into
 * our VM.
 */
static inline int handle_pte_fault(struct task_struct *tsk,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	entry = *pte;
	if (!pte_present(entry)) {
		if (pte_none(entry))
			return do_no_page(tsk, vma, address, write_access, pte);
		return do_swap_page(tsk, vma, address, pte, pte_val(entry), write_access);
	}

	/*
	 * Ok, the entry was present, we need to get the page table
	 * lock to synchronize with kswapd, and verify that the entry
	 * didn't change from under us..
	 */
	spin_lock(&tsk->mm->page_table_lock);
	if (pte_val(entry) == pte_val(*pte)) {
		if (write_access) {
			if (!pte_write(entry))
				return do_wp_page(tsk, vma, address, pte, entry);

			entry = pte_mkdirty(entry);
		}
		entry = pte_mkyoung(entry);
		set_pte(pte, entry);
		flush_tlb_page(vma, address);
		update_mmu_cache(vma, address, entry);
	}
	spin_unlock(&tsk->mm->page_table_lock);
	return 1;
}
/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	pgd_t *pgd;
	pmd_t *pmd;

	pgd = pgd_offset(vma->vm_mm, address);
	pmd = pmd_alloc(pgd, address);
	if (pmd) {
		pte_t * pte = pte_alloc(pmd, address);
		if (pte)
			return handle_pte_fault(tsk, vma, address, write_access, pte);
	}
	return -1;
}
/*
 * Simplistic page force-in..
 */
int make_pages_present(unsigned long addr, unsigned long end)
{
	int write;
	struct task_struct *tsk = current;
	struct vm_area_struct * vma;

	vma = find_vma(tsk->mm, addr);
	write = (vma->vm_flags & VM_WRITE) != 0;
	while (addr < end) {
		if (handle_mm_fault(tsk, vma, addr, write) < 0)
			return -1;
		addr += PAGE_SIZE;
	}
	return 0;
}