/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/string.h>
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;
/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
	if (from == ZERO_PAGE) {
		clear_page(to);
		return;
	}
	copy_page(to, from);
}

mem_map_t * mem_map = NULL;
/*
 * oom() prints a message (so that the user knows why the process died),
 * and gives the process an untrappable SIGKILL.
 */
void oom(struct task_struct * task)
{
	printk("\nOut of memory for %s.\n", task->comm);
	force_sig(SIGKILL, task);
}
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
	pte_t * pte;

	if (pmd_none(*dir))
		return;
	if (pmd_bad(*dir)) {
		printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
		pmd_clear(dir);
		return;
	}
	pte = pte_offset(dir, 0);
	pmd_clear(dir);
	pte_free(pte);
}
static inline void free_one_pgd(pgd_t * dir)
{
	int j;
	pmd_t * pmd;

	if (pgd_none(*dir))
		return;
	if (pgd_bad(*dir)) {
		printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
		pgd_clear(dir);
		return;
	}
	pmd = pmd_offset(dir, 0);
	pgd_clear(dir);
	for (j = 0; j < PTRS_PER_PMD ; j++)
		free_one_pmd(pmd+j);
	pmd_free(pmd);
}
/* Low and high watermarks for page table cache.
   The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
 */
int pgt_cache_water[2] = { 25, 50 };
/* Returns the number of pages freed */
int check_pgt_cache(void)
{
	return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}
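/*
 * A worked reading of the watermarks above (an interpretation, not part of
 * the original documentation): with the default { 25, 50 },
 * do_check_pgt_cache() is presumably asked to trim the cache of
 * pre-allocated page-table pages back towards 25 entries once it has grown
 * beyond 50, and it reports how many pages it actually freed.
 */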
/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct task_struct * tsk)
{
	pgd_t * page_dir = tsk->mm->pgd;
	int i;

	if (!page_dir || page_dir == swapper_pg_dir)
		goto out_bad;
	for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
		free_one_pgd(page_dir + i);

	/* keep the page table cache within bounds */
	check_pgt_cache();
	return;

out_bad:
	printk(KERN_ERR
	       "clear_page_tables: %s trying to clear kernel pgd\n",
	       tsk->comm);
	return;
}
/*
 * This function frees up all page tables of a process when it exits. It
 * is the same as "clear_page_tables()", except it also frees the old
 * page table directory.
 */
void free_page_tables(struct mm_struct * mm)
{
	pgd_t * page_dir = mm->pgd;
	int i;

	if (!page_dir)
		goto out;
	if (page_dir == swapper_pg_dir)
		goto out_bad;
	for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
		free_one_pgd(page_dir + i);
	pgd_free(page_dir);

	/* keep the page table cache within bounds */
	check_pgt_cache();
out:
	return;

out_bad:
	printk(KERN_ERR
	       "free_page_tables: Trying to free kernel pgd\n");
	return;
}
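/*
 * Give "tsk" a fresh page directory: allocate one with pgd_alloc(),
 * hand it to the architecture code via SET_PAGE_DIR(), and record it
 * in tsk->mm->pgd.  Returns 0 on success, -ENOMEM if the allocation
 * fails.  (Descriptive note; the function below is otherwise
 * undocumented.)
 */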
int new_page_tables(struct task_struct * tsk)
{
	pgd_t * new_pg;

	if (!(new_pg = pgd_alloc()))
		return -ENOMEM;
	SET_PAGE_DIR(tsk, new_pg);
	tsk->mm->pgd = new_pg;
	return 0;
}
#define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
			struct vm_area_struct *vma)
{
	pgd_t * src_pgd, * dst_pgd;
	unsigned long address = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;

	src_pgd = pgd_offset(src, address)-1;
	dst_pgd = pgd_offset(dst, address)-1;

	for (;;) {
		pmd_t * src_pmd, * dst_pmd;

		src_pgd++; dst_pgd++;

		/* copy_pmd_range */

		if (pgd_none(*src_pgd))
			goto skip_copy_pmd_range;
		if (pgd_bad(*src_pgd)) {
			printk("copy_pmd_range: bad pgd (%08lx)\n",
				pgd_val(*src_pgd));
			pgd_clear(src_pgd);
skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
			if (address >= end)
				goto out;
			continue;
		}
		if (pgd_none(*dst_pgd)) {
			if (!pmd_alloc(dst_pgd, 0))
				goto nomem;
		}

		src_pmd = pmd_offset(src_pgd, address);
		dst_pmd = pmd_offset(dst_pgd, address);

		do {
			pte_t * src_pte, * dst_pte;

			/* copy_pte_range */

			if (pmd_none(*src_pmd))
				goto skip_copy_pte_range;
			if (pmd_bad(*src_pmd)) {
				printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
				pmd_clear(src_pmd);
skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
				if (address >= end)
					goto out;
				goto cont_copy_pmd_range;
			}
			if (pmd_none(*dst_pmd)) {
				if (!pte_alloc(dst_pmd, 0))
					goto nomem;
			}

			src_pte = pte_offset(src_pmd, address);
			dst_pte = pte_offset(dst_pmd, address);

			do {
				pte_t pte = *src_pte;
				unsigned long page_nr;

				/* copy_one_pte */

				if (pte_none(pte))
					goto cont_copy_pte_range;
				if (!pte_present(pte)) {
					swap_duplicate(pte_val(pte));
					set_pte(dst_pte, pte);
					goto cont_copy_pte_range;
				}
				page_nr = MAP_NR(pte_page(pte));
				if (page_nr >= max_mapnr ||
				    PageReserved(mem_map+page_nr)) {
					set_pte(dst_pte, pte);
					goto cont_copy_pte_range;
				}
				if (cow)
					pte = pte_wrprotect(pte);
				set_pte(dst_pte, pte_mkold(pte));
				set_pte(src_pte, pte);
				atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:		address += PAGE_SIZE;
				if (address >= end)
					goto out;
				src_pte++;
				dst_pte++;
			} while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:	src_pmd++;
			dst_pmd++;
		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
	}
out:
	return 0;

nomem:
	return -ENOMEM;
}
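/*
 * Note on the "cow" logic above (a summary, not original documentation):
 * cow is non-zero only for private writable mappings (VM_WRITE set,
 * VM_SHARED clear).  For those, the pte is write-protected in both the
 * parent and the child, so the physical page stays shared read-only until
 * one side writes to it and do_wp_page() below gives that side its own
 * copy.
 */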
/*
 * Return indicates whether a page was freed so caller can adjust rss
 */
static inline int free_pte(pte_t page)
{
	if (pte_present(page)) {
		unsigned long addr = pte_page(page);
		if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
			return 0;
		/*
		 * free_page() used to be able to clear swap cache
		 * entries.  We may now have to do it manually.
		 */
		free_page_and_swap_cache(addr);
		return 1;
	}
	swap_free(pte_val(page));
	return 0;
}
static inline void forget_pte(pte_t page)
{
	if (!pte_none(page)) {
		printk("forget_pte: old mapping existed!\n");
		free_pte(page);
	}
}
static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
	pte_t * pte;
	int freed;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	address &= ~PMD_MASK;
	if (address + size > PMD_SIZE)
		size = PMD_SIZE - address;
	size >>= PAGE_SHIFT;
	freed = 0;
	for (;;) {
		pte_t page;
		if (!size)
			break;
		page = *pte;
		pte++;
		size--;
		if (pte_none(page))
			continue;
		pte_clear(pte-1);
		freed += free_pte(page);
	}
	return freed;
}
static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
{
	pmd_t * pmd;
	unsigned long end;
	int freed;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}
	pmd = pmd_offset(dir, address);
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	freed = 0;
	do {
		freed += zap_pte_range(pmd, address, end - address);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return freed;
}
/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int freed = 0;

	dir = pgd_offset(mm, address);
	while (address < end) {
		freed += zap_pmd_range(dir, address, end - address);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	/*
	 * Update rss for the mm_struct (not necessarily current->mm)
	 */
	if (mm->rss > 0) {
		mm->rss -= freed;
		if (mm->rss < 0)
			mm->rss = 0;
	}
}
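/*
 * Usage note (a summary of how this file itself uses the function):
 * zap_page_range() only tears down the page table entries and drops the
 * page or swap references; callers such as vmtruncate() below bracket it
 * with flush_cache_range() beforehand and flush_tlb_range() afterwards.
 */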
static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		pte_t oldpage = *pte;
		set_pte(pte, zero_pte);
		forget_pte(oldpage);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	do {
		pte_t * pte = pte_alloc(pmd, address);
		if (!pte)
			return -ENOMEM;
		zeromap_pte_range(pte, address, end - address, zero_pte);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = address;
	unsigned long end = address + size;
	pte_t zero_pte;

	zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
	dir = pgd_offset(current->mm, address);
	flush_cache_range(current->mm, beg, end);
	while (address < end) {
		pmd_t *pmd = pmd_alloc(dir, address);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
		if (error)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(current->mm, beg, end);
	return error;
}
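/*
 * The whole range ends up mapped to the single global ZERO_PAGE,
 * write-protected: reads return zeroes without allocating any memory,
 * and the first write to a page takes a protection fault that is
 * resolved by the copy-on-write path (do_wp_page() below, which uses
 * copy_cow_page() above to avoid actually reading the zero page).
 */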
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access")
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		unsigned long mapnr;
		pte_t oldpage = *pte;
		pte_clear(pte);

		mapnr = MAP_NR(__va(phys_addr));
		if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
			set_pte(pte, mk_pte_phys(phys_addr, prot));
		forget_pte(oldpage);
		address += PAGE_SIZE;
		phys_addr += PAGE_SIZE;
		pte++;
	} while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	phys_addr -= address;
	do {
		pte_t * pte = pte_alloc(pmd, address);
		if (!pte)
			return -ENOMEM;
		remap_pte_range(pte, address, end - address, address + phys_addr, prot);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = from;
	unsigned long end = from + size;

	phys_addr -= from;
	dir = pgd_offset(current->mm, from);
	flush_cache_range(current->mm, beg, end);
	while (from < end) {
		pmd_t *pmd = pmd_alloc(dir, from);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
		if (error)
			break;
		from = (from + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(current->mm, beg, end);
	return error;
}
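/*
 * Typical use of remap_page_range() (a sketch, not part of this file): a
 * character driver's mmap() method in a 2.2-era kernel maps its device
 * memory into the caller's vma with something like
 *
 *	static int mydev_mmap(struct file * file, struct vm_area_struct * vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		if (remap_page_range(vma->vm_start,
 *				     MYDEV_PHYS_BASE + vma->vm_offset,
 *				     size, vma->vm_page_prot))
 *			return -EAGAIN;
 *		return 0;
 *	}
 *
 * where mydev_mmap and MYDEV_PHYS_BASE are made-up names standing for the
 * driver's mmap operation and its device's physical base address.
 */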
/*
 * sanity-check function..
 */
static void put_page(pte_t * page_table, pte_t pte)
{
	if (!pte_none(*page_table)) {
		free_page_and_swap_cache(pte_page(pte));
		return;
	}
	/* no need for flush_tlb */
	set_pte(page_table, pte);
}
/*
 * This routine is used to map a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
	pgd_t * pgd;
	pmd_t * pmd;
	pte_t * pte;

	if (MAP_NR(page) >= max_mapnr)
		printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
	if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
		printk("mem_map disagrees with %08lx at %08lx\n",page,address);
	pgd = pgd_offset(tsk->mm,address);
	pmd = pmd_alloc(pgd, address);
	if (!pmd) {
		free_page(page);
		oom(tsk);
		return 0;
	}
	pte = pte_alloc(pmd, address);
	if (!pte) {
		free_page(page);
		oom(tsk);
		return 0;
	}
	if (!pte_none(*pte)) {
		printk("put_dirty_page: page already exists\n");
		free_page(page);
		return 0;
	}
	flush_page_to_ram(page);
	set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
	/* no need for flush_tlb */
	return page;
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here is that it results
 * in better assembly code.. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 */
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, pte_t *page_table)
{
	pte_t pte;
	unsigned long old_page, new_page;
	struct page * page_map;

	pte = *page_table;
	new_page = __get_free_page(GFP_USER);
	/* Did someone else copy this page for us while we slept? */
	if (pte_val(*page_table) != pte_val(pte))
		goto end_wp_page;
	if (!pte_present(pte))
		goto end_wp_page;
	if (pte_write(pte))
		goto end_wp_page;
	old_page = pte_page(pte);
	if (MAP_NR(old_page) >= max_mapnr)
		goto bad_wp_page;
	tsk->min_flt++;
	page_map = mem_map + MAP_NR(old_page);

	/*
	 * Do we need to copy?
	 */
	if (is_page_shared(page_map)) {
		if (new_page) {
			if (PageReserved(mem_map + MAP_NR(old_page)))
				++vma->vm_mm->rss;
			copy_cow_page(old_page,new_page);
			flush_page_to_ram(old_page);
			flush_page_to_ram(new_page);
			flush_cache_page(vma, address);
			set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
			free_page(old_page);
			flush_tlb_page(vma, address);
			return 1;
		}
		flush_cache_page(vma, address);
		set_pte(page_table, BAD_PAGE);
		flush_tlb_page(vma, address);
		free_page(old_page);
		oom(tsk);
		return 0;
	}
	if (PageSwapCache(page_map))
		delete_from_swap_cache(page_map);
	flush_cache_page(vma, address);
	set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
	flush_tlb_page(vma, address);
end_wp_page:
	if (new_page)
		free_page(new_page);
	return 1;

bad_wp_page:
	printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
	send_sig(SIGKILL, tsk, 1);
	if (new_page)
		free_page(new_page);
	return 0;
}
/*
 * This function zeroes out partial mmap'ed pages at truncation time..
 */
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
	pgd_t *page_dir;
	pmd_t *page_middle;
	pte_t *page_table, pte;

	page_dir = pgd_offset(vma->vm_mm, address);
	if (pgd_none(*page_dir))
		return;
	if (pgd_bad(*page_dir)) {
		printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
		pgd_clear(page_dir);
		return;
	}
	page_middle = pmd_offset(page_dir, address);
	if (pmd_none(*page_middle))
		return;
	if (pmd_bad(*page_middle)) {
		printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
		pmd_clear(page_middle);
		return;
	}
	page_table = pte_offset(page_middle, address);
	pte = *page_table;
	if (!pte_present(pte))
		return;
	flush_cache_page(vma, address);
	address &= ~PAGE_MASK;
	address += pte_page(pte);
	if (MAP_NR(address) >= max_mapnr)
		return;
	memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
	flush_page_to_ram(pte_page(pte));
}
/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page. Ugly, but necessary.
 */
void vmtruncate(struct inode * inode, unsigned long offset)
{
	struct vm_area_struct * mpnt;

	truncate_inode_pages(inode, offset);
	if (!inode->i_mmap)
		return;
	mpnt = inode->i_mmap;
	do {
		struct mm_struct *mm = mpnt->vm_mm;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long len = end - start;
		unsigned long diff;

		/* mapping wholly truncated? */
		if (mpnt->vm_offset >= offset) {
			flush_cache_range(mm, start, end);
			zap_page_range(mm, start, len);
			flush_tlb_range(mm, start, end);
			continue;
		}
		/* mapping wholly unaffected? */
		diff = offset - mpnt->vm_offset;
		if (diff >= len)
			continue;
		/* Ok, partially affected.. */
		start += diff;
		len = (len - diff) & PAGE_MASK;
		if (start & ~PAGE_MASK) {
			partial_clear(mpnt, start);
			start = (start + ~PAGE_MASK) & PAGE_MASK;
		}
		flush_cache_range(mm, start, end);
		zap_page_range(mm, start, len);
		flush_tlb_range(mm, start, end);
	} while ((mpnt = mpnt->vm_next_share) != NULL);
}
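/*
 * Fault in a page that was swapped out (the pte is not present and not
 * empty).  Either the generic swap_in() path is used or, if the vma
 * provides a swapin operation, that is asked to produce the new pte; in
 * the latter case the pte is dropped again if somebody else already
 * handled the fault while we slept, and is write-protected when the page
 * is still shared and the mapping is private.  Runs under the big kernel
 * lock.  (Descriptive note; the function below is otherwise undocumented.)
 */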
static int do_swap_page(struct task_struct * tsk,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, pte_t entry, int write_access)
{
	lock_kernel();
	if (!vma->vm_ops || !vma->vm_ops->swapin) {
		swap_in(tsk, vma, page_table, pte_val(entry), write_access);
		flush_page_to_ram(pte_page(*page_table));
	} else {
		pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
		if (pte_val(*page_table) != pte_val(entry)) {
			free_page(pte_page(page));
		} else {
			if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
			    !(vma->vm_flags & VM_SHARED))
				page = pte_wrprotect(page);
			++vma->vm_mm->rss;
			++tsk->maj_flt;
			flush_page_to_ram(pte_page(page));
			set_pte(page_table, page);
		}
	}
	unlock_kernel();
	return 1;
}
/*
 * This only needs the MM semaphore
 */
static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access)
{
	pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
	if (write_access) {
		unsigned long page = __get_free_page(GFP_USER);
		if (!page)
			return 0;
		clear_page(page);
		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
		vma->vm_mm->rss++;
		tsk->min_flt++;
		flush_page_to_ram(page);
	}
	put_page(page_table, entry);
	return 1;
}
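/*
 * In other words: an anonymous read fault costs nothing but a read-only
 * reference to the shared ZERO_PAGE, while an anonymous write fault
 * allocates and zeroes a private page up front and maps it writable and
 * dirty.
 */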
/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * This is called with the MM semaphore held, but without the kernel
 * lock.
 */
static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	unsigned long page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(tsk, vma, page_table, write_access);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible. It's
	 * essentially an early COW detection.
	 *
	 * We need to grab the kernel lock for this..
	 */
	lock_kernel();
	page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
		(vma->vm_flags & VM_SHARED)?0:write_access);
	unlock_kernel();
	if (!page)
		return 0;

	++tsk->maj_flt;
	++vma->vm_mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		entry = pte_wrprotect(entry);
	put_page(page_table, entry);
	/* no need to invalidate: a not-present page shouldn't be cached */
	return 1;
}
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 */
static inline int handle_pte_fault(struct task_struct *tsk,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry = *pte;

	if (!pte_present(entry)) {
		if (pte_none(entry))
			return do_no_page(tsk, vma, address, write_access, pte);
		return do_swap_page(tsk, vma, address, pte, entry, write_access);
	}

	entry = pte_mkyoung(entry);
	set_pte(pte, entry);
	flush_tlb_page(vma, address);
	if (!write_access)
		return 1;

	if (pte_write(entry)) {
		entry = pte_mkdirty(entry);
		set_pte(pte, entry);
		flush_tlb_page(vma, address);
		return 1;
	}
	return do_wp_page(tsk, vma, address, pte);
}
/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	pgd_t *pgd;
	pmd_t *pmd;

	pgd = pgd_offset(vma->vm_mm, address);
	pmd = pmd_alloc(pgd, address);
	if (pmd) {
		pte_t * pte = pte_alloc(pmd, address);
		if (pte) {
			if (handle_pte_fault(tsk, vma, address, write_access, pte)) {
				update_mmu_cache(vma, address, *pte);
				return 1;
			}
		}
	}
	return 0;
}
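/*
 * Summary of the fault paths above (a reading of the code, not original
 * documentation):
 *
 *	pte state		write_access	handled by
 *	-----------------	------------	------------------------------
 *	none			any		do_no_page / do_anonymous_page
 *	!present, !none		any		do_swap_page
 *	present, writable	1		mark young + dirty, done
 *	present, read-only	1		do_wp_page (COW)
 *	present			0		mark young, done
 *
 * handle_mm_fault() returns 1 if the fault was serviced, 0 if it could not
 * be (failed page-table allocation, out of memory, or a bogus page), in
 * which case the architecture's fault handler decides what to do.
 */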
/*
 * Simplistic page force-in..
 */
void make_pages_present(unsigned long addr, unsigned long end)
{
	int write;
	struct vm_area_struct * vma;

	vma = find_vma(current->mm, addr);
	write = (vma->vm_flags & VM_WRITE) != 0;
	while (addr < end) {
		handle_mm_fault(current, vma, addr, write);
		addr += PAGE_SIZE;
	}
}