/* mm/memory.c (davej-history.git, import of 2.3.13pre3) */
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 */
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/swapctl.h>
#include <linux/iobuf.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;
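
/*
 * These are set up by the architecture's early memory initialisation:
 * max_mapnr bounds the valid mem_map indices, num_physpages counts the
 * physical page frames, and high_memory marks the top of the kernel's
 * directly mapped memory.
 */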
/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
	if (from == ZERO_PAGE(to)) {
		clear_page(to);
		return;
	}
	copy_page(to, from);
}
mem_map_t * mem_map = NULL;
/*
 * oom() prints a message (so that the user knows why the process died),
 * and gives the process an untrappable SIGKILL.
 */
void oom(struct task_struct * task)
{
	printk("\nOut of memory for %s.\n", task->comm);
	force_sig(SIGKILL, task);
}
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
	pte_t * pte;

	if (pmd_none(*dir))
		return;
	if (pmd_bad(*dir)) {
		printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
		pmd_clear(dir);
		return;
	}
	pte = pte_offset(dir, 0);
	pmd_clear(dir);
	pte_free(pte);
}
static inline void free_one_pgd(pgd_t * dir)
{
	int j;
	pmd_t * pmd;

	if (pgd_none(*dir))
		return;
	if (pgd_bad(*dir)) {
		printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
		pgd_clear(dir);
		return;
	}
	pmd = pmd_offset(dir, 0);
	pgd_clear(dir);
	for (j = 0; j < PTRS_PER_PMD ; j++)
		free_one_pmd(pmd+j);
	pmd_free(pmd);
}
/*
 * Low and high watermarks for the page table cache.  The system should
 * try to keep pgt_cache_water[0] <= cache elements <= pgt_cache_water[1].
 */
int pgt_cache_water[2] = { 25, 50 };

/* Returns the number of pages freed */
int check_pgt_cache(void)
{
	return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}
/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
	pgd_t * page_dir = mm->pgd;

	page_dir += first;
	do {
		free_one_pgd(page_dir);
		page_dir++;
	} while (--nr);

	/* keep the page table cache within bounds */
	check_pgt_cache();
}
#define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
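
/*
 * These masks pick out a pte/pmd pointer's byte offset within its table.
 * copy_page_range() below uses them as loop conditions: on i386, for
 * example, PTRS_PER_PTE is 1024 and sizeof(pte_t) is 4, so the mask is
 * 0xffc and an incremented pte pointer ANDs to zero exactly when it has
 * stepped past the last entry of its page table.
 */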
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
			struct vm_area_struct *vma)
{
	pgd_t * src_pgd, * dst_pgd;
	unsigned long address = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
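	/*
	 * "cow" is true only for private mappings that may be written
	 * (VM_MAYWRITE set, VM_SHARED clear).  For those, the copy loop
	 * below write-protects the pte in both parent and child, so the
	 * first write from either side faults into do_wp_page() and gets
	 * a private copy of the page.
	 */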

	src_pgd = pgd_offset(src, address)-1;
	dst_pgd = pgd_offset(dst, address)-1;

	for (;;) {
		pmd_t * src_pmd, * dst_pmd;

		src_pgd++; dst_pgd++;

		/* copy_pmd_range */

		if (pgd_none(*src_pgd))
			goto skip_copy_pmd_range;
		if (pgd_bad(*src_pgd)) {
			printk("copy_pmd_range: bad pgd (%08lx)\n",
				pgd_val(*src_pgd));
			pgd_clear(src_pgd);
skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
			if (address >= end)
				goto out;
			continue;
		}
		if (pgd_none(*dst_pgd)) {
			if (!pmd_alloc(dst_pgd, 0))
				goto nomem;
		}

		src_pmd = pmd_offset(src_pgd, address);
		dst_pmd = pmd_offset(dst_pgd, address);

		do {
			pte_t * src_pte, * dst_pte;

			/* copy_pte_range */

			if (pmd_none(*src_pmd))
				goto skip_copy_pte_range;
			if (pmd_bad(*src_pmd)) {
				printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
				pmd_clear(src_pmd);
skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
				if (address >= end)
					goto out;
				goto cont_copy_pmd_range;
			}
			if (pmd_none(*dst_pmd)) {
				if (!pte_alloc(dst_pmd, 0))
					goto nomem;
			}

			src_pte = pte_offset(src_pmd, address);
			dst_pte = pte_offset(dst_pmd, address);

			do {
				pte_t pte = *src_pte;
				unsigned long page_nr;

				/* copy_one_pte */

				if (pte_none(pte))
					goto cont_copy_pte_range;
				if (!pte_present(pte)) {
					swap_duplicate(pte_val(pte));
					set_pte(dst_pte, pte);
					goto cont_copy_pte_range;
				}
				page_nr = MAP_NR(pte_page(pte));
				if (page_nr >= max_mapnr ||
				    PageReserved(mem_map+page_nr)) {
					set_pte(dst_pte, pte);
					goto cont_copy_pte_range;
				}
				/* If it's a COW mapping, write protect it both in the parent and the child */
				if (cow) {
					pte = pte_wrprotect(pte);
					set_pte(src_pte, pte);
				}
				/* If it's a shared mapping, mark it clean in the child */
				if (vma->vm_flags & VM_SHARED)
					pte = pte_mkclean(pte);
				set_pte(dst_pte, pte_mkold(pte));
				get_page(mem_map + page_nr);

cont_copy_pte_range:		address += PAGE_SIZE;
				if (address >= end)
					goto out;
				src_pte++;
				dst_pte++;
			} while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:	src_pmd++;
			dst_pmd++;
		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
	}
out:
	return 0;

nomem:
	return -ENOMEM;
}
/*
 * Return indicates whether a page was freed so caller can adjust rss
 */
static inline int free_pte(pte_t page)
{
	if (pte_present(page)) {
		unsigned long addr = pte_page(page);
		if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
			return 0;
		/*
		 * free_page() used to be able to clear swap cache
		 * entries.  We may now have to do it manually.
		 */
		free_page_and_swap_cache(addr);
		return 1;
	}
	swap_free(pte_val(page));
	return 0;
}
static inline void forget_pte(pte_t page)
{
	if (!pte_none(page)) {
		printk("forget_pte: old mapping existed!\n");
		free_pte(page);
	}
}
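
/*
 * Clear and free every pte mapped by one pmd entry in the range
 * [address, address+size).  Returns how many in-core pages were actually
 * freed, so the caller can adjust mm->rss.
 */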
static inline int zap_pte_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size)
{
	pte_t * pte;
	int freed;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	address &= ~PMD_MASK;
	if (address + size > PMD_SIZE)
		size = PMD_SIZE - address;
	size >>= PAGE_SHIFT;
	freed = 0;
	for (;;) {
		pte_t page;
		if (!size)
			break;
		page = *pte;
		pte++;
		size--;
		pte_clear(pte-1);
		if (pte_none(page))
			continue;
		freed += free_pte(page);
	}
	return freed;
}
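
/*
 * Walk the pmd entries under one pgd entry, zapping each pte range in
 * turn and accumulating the freed-page count.
 */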
static inline int zap_pmd_range(struct mm_struct *mm, pgd_t * dir, unsigned long address, unsigned long size)
{
	pmd_t * pmd;
	unsigned long end;
	int freed;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}
	pmd = pmd_offset(dir, address);
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	freed = 0;
	do {
		freed += zap_pte_range(mm, pmd, address, end - address);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return freed;
}
/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int freed = 0;

	dir = pgd_offset(mm, address);

	/*
	 * This is a long-lived spinlock. That's fine.
	 * There's no contention, because the page table
	 * lock only protects against kswapd anyway, and
	 * even if kswapd happened to be looking at this
	 * process we _want_ it to get stuck.
	 */
	spin_lock(&mm->page_table_lock);
	while (address < end) {
		freed += zap_pmd_range(mm, dir, address, end - address);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	spin_unlock(&mm->page_table_lock);
	/*
	 * Update rss for the mm_struct (not necessarily current->mm)
	 */
	if (mm->rss > 0) {
		mm->rss -= freed;
		if (mm->rss < 0)
			mm->rss = 0;
	}
}
/*
 * Do a quick page-table lookup for a single page.
 */
static unsigned long follow_page(unsigned long address)
{
	pgd_t *pgd;
	pmd_t *pmd;

	pgd = pgd_offset(current->mm, address);
	pmd = pmd_offset(pgd, address);
	if (pmd) {
		pte_t * pte = pte_offset(pmd, address);
		if (pte && pte_present(*pte)) {
			return pte_page(*pte);
		}
	}

	printk(KERN_ERR "Missing page in follow_page\n");
	return 0;
}
/*
 * Given a physical address, is there a useful struct page pointing to it?
 */
static struct page * get_page_map(unsigned long page)
{
	struct page *map;

	if (MAP_NR(page) >= max_mapnr)
		return 0;
	if (page == ZERO_PAGE(page))
		return 0;
	map = mem_map + MAP_NR(page);
	if (PageReserved(map))
		return 0;
	return map;
}
/*
 * Force in an entire range of pages from the current process's user VA,
 * and pin and lock the pages for IO.
 */

#define dprintk(x...)
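/*
 * dprintk() expands to nothing here, so the debug traces below compile
 * away; point it at printk() to re-enable them.
 */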
int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
{
	unsigned long		ptr, end;
	int			err;
	struct mm_struct *	mm;
	struct vm_area_struct *	vma = 0;
	unsigned long		page;
	struct page *		map;
	int			doublepage = 0;
	int			repeat = 0;
	int			i;

	/* Make sure the iobuf is not already mapped somewhere. */
	if (iobuf->nr_pages)
		return -EINVAL;

	mm = current->mm;
	dprintk ("map_user_kiobuf: begin\n");

	ptr = va & PAGE_MASK;
	end = (va + len + PAGE_SIZE - 1) & PAGE_MASK;
	err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT);
	if (err)
		return err;

 repeat:
	down(&mm->mmap_sem);

	err = -EFAULT;
	iobuf->locked = 1;
	iobuf->offset = va & ~PAGE_MASK;
	iobuf->length = len;

	i = 0;

	/*
	 * First of all, try to fault in all of the necessary pages
	 */
	while (ptr < end) {
		if (!vma || ptr >= vma->vm_end) {
			vma = find_vma(current->mm, ptr);
			if (!vma)
				goto out_unlock;
		}
		if (handle_mm_fault(current, vma, ptr, (rw==READ)) <= 0)
			goto out_unlock;
		spin_lock(&mm->page_table_lock);
		page = follow_page(ptr);
		if (!page) {
			dprintk (KERN_ERR "Missing page in map_user_kiobuf\n");
			map = NULL;
			goto retry;
		}
		map = get_page_map(page);
		if (map) {
			if (TryLockPage(map)) {
				goto retry;
			}
			atomic_inc(&map->count);
		}
		spin_unlock(&mm->page_table_lock);
		dprintk ("Installing page %p %p: %d\n", (void *)page, map, i);
		iobuf->pagelist[i] = page;
		iobuf->maplist[i] = map;
		iobuf->nr_pages = ++i;

		ptr += PAGE_SIZE;
	}

	up(&mm->mmap_sem);
	dprintk ("map_user_kiobuf: end OK\n");
	return 0;

 out_unlock:
	up(&mm->mmap_sem);
	unmap_kiobuf(iobuf);
	dprintk ("map_user_kiobuf: end %d\n", err);
	return err;

 retry:

	/*
	 * Undo the locking so far, wait on the page we got to, and try again.
	 */
	spin_unlock(&mm->page_table_lock);
	unmap_kiobuf(iobuf);
	up(&mm->mmap_sem);

	/*
	 * Did the release also unlock the page we got stuck on?
	 */
	if (map) {
		if (!PageLocked(map)) {
			/* If so, we may well have the page mapped twice
			 * in the IO address range.  Bad news.  Of
			 * course, it _might_ just be a coincidence,
			 * but if it happens more than once, chances
			 * are we have a double-mapped page. */
			if (++doublepage >= 3) {
				return -EINVAL;
			}
		}

		/*
		 * Try again...
		 */
		wait_on_page(map);
	}

	if (++repeat < 16)
		goto repeat;
	return -EAGAIN;
}
/*
 * Unmap all of the pages referenced by a kiobuf.  We release the pages,
 * and unlock them if they were locked.
 */
void unmap_kiobuf (struct kiobuf *iobuf)
{
	int i;
	struct page *map;

	for (i = 0; i < iobuf->nr_pages; i++) {
		map = iobuf->maplist[i];

		if (map && iobuf->locked) {
			__free_page(map);
			UnlockPage(map);
		}
	}

	iobuf->nr_pages = 0;
	iobuf->locked = 0;
}
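
/*
 * zeromap_page_range() and its helpers below map a user range to the
 * write-protected ZERO_PAGE: reads see zeroes without allocating memory,
 * and (if the mapping is writable) the first write faults into
 * do_wp_page(), which copies in a private page.
 */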
static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
                                     unsigned long size, pgprot_t prot)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
		pte_t oldpage = *pte;
		set_pte(pte, zero_pte);
		forget_pte(oldpage);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address,
                                    unsigned long size, pgprot_t prot)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	do {
		pte_t * pte = pte_alloc(pmd, address);
		if (!pte)
			return -ENOMEM;
		zeromap_pte_range(pte, address, end - address, prot);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = address;
	unsigned long end = address + size;

	dir = pgd_offset(current->mm, address);
	flush_cache_range(current->mm, beg, end);
	while (address < end) {
		pmd_t *pmd = pmd_alloc(dir, address);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = zeromap_pmd_range(pmd, address, end - address, prot);
		if (error)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(current->mm, beg, end);
	return error;
}
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		unsigned long mapnr;
		pte_t oldpage = *pte;
		pte_clear(pte);

		mapnr = MAP_NR(__va(phys_addr));
		if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
			set_pte(pte, mk_pte_phys(phys_addr, prot));
		forget_pte(oldpage);
		address += PAGE_SIZE;
		phys_addr += PAGE_SIZE;
		pte++;
	} while (address < end);
}
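
/*
 * Note the "phys_addr -= address" in remap_pmd_range() and
 * remap_page_range() below: phys_addr is kept biased by the current
 * virtual address, so "address + phys_addr" at any point in the walk
 * yields the physical address that should back that virtual address.
 */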
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	phys_addr -= address;
	do {
		pte_t * pte = pte_alloc(pmd, address);
		if (!pte)
			return -ENOMEM;
		remap_pte_range(pte, address, end - address, address + phys_addr, prot);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = from;
	unsigned long end = from + size;

	phys_addr -= from;
	dir = pgd_offset(current->mm, from);
	flush_cache_range(current->mm, beg, end);
	while (from < end) {
		pmd_t *pmd = pmd_alloc(dir, from);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
		if (error)
			break;
		from = (from + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(current->mm, beg, end);
	return error;
}
/*
 * This routine is used to map in a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
	pgd_t * pgd;
	pmd_t * pmd;
	pte_t * pte;

	if (MAP_NR(page) >= max_mapnr)
		printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
	if (page_count(mem_map + MAP_NR(page)) != 1)
		printk("mem_map disagrees with %08lx at %08lx\n",page,address);
	pgd = pgd_offset(tsk->mm,address);
	pmd = pmd_alloc(pgd, address);
	if (!pmd) {
		free_page(page);
		oom(tsk);
		return 0;
	}
	pte = pte_alloc(pmd, address);
	if (!pte) {
		free_page(page);
		oom(tsk);
		return 0;
	}
	if (!pte_none(*pte)) {
		printk("put_dirty_page: pte %08lx already exists\n",
		       pte_val(*pte));
		free_page(page);
		return 0;
	}
	flush_page_to_ram(page);
	set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
	/* no need for flush_tlb */
	return page;
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here is that it results
 * in better assembly code.. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with the page table read-lock held, and need to exit without
 * it.
 */
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, pte_t *page_table, pte_t pte)
{
	unsigned long old_page, new_page;
	struct page * page;

	old_page = pte_page(pte);
	if (MAP_NR(old_page) >= max_mapnr)
		goto bad_wp_page;
	tsk->min_flt++;
	page = mem_map + MAP_NR(old_page);

	/*
	 * We can avoid the copy if:
	 * - we're the only user (count == 1)
	 * - the only other user is the swap cache,
	 *   and the only swap cache user is itself,
	 *   in which case we can remove the page
	 *   from the swap cache.
	 */
	switch (page_count(page)) {
	case 2:
		if (!PageSwapCache(page))
			break;
		if (swap_count(page->offset) != 1)
			break;
		delete_from_swap_cache(page);
		/* FallThrough */
	case 1:
		flush_cache_page(vma, address);
		set_pte(page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
		flush_tlb_page(vma, address);
		spin_unlock(&tsk->mm->page_table_lock);
		return 1;
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	spin_unlock(&tsk->mm->page_table_lock);
	new_page = __get_free_page(GFP_USER);
	if (!new_page)
		return -1;
	spin_lock(&tsk->mm->page_table_lock);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	if (pte_val(*page_table) == pte_val(pte)) {
		if (PageReserved(page))
			++vma->vm_mm->rss;
		copy_cow_page(old_page,new_page);
		flush_page_to_ram(old_page);
		flush_page_to_ram(new_page);
		flush_cache_page(vma, address);
		set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
		flush_tlb_page(vma, address);

		/* Free the old page.. */
		new_page = old_page;
	}
	spin_unlock(&tsk->mm->page_table_lock);
	free_page(new_page);
	return 1;

bad_wp_page:
	spin_unlock(&tsk->mm->page_table_lock);
	printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
	return -1;
}
/*
 * This function zeroes out partial mmap'ed pages at truncation time..
 */
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
	pgd_t *page_dir;
	pmd_t *page_middle;
	pte_t *page_table, pte;

	page_dir = pgd_offset(vma->vm_mm, address);
	if (pgd_none(*page_dir))
		return;
	if (pgd_bad(*page_dir)) {
		printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
		pgd_clear(page_dir);
		return;
	}
	page_middle = pmd_offset(page_dir, address);
	if (pmd_none(*page_middle))
		return;
	if (pmd_bad(*page_middle)) {
		printk("bad page table middle entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
		pmd_clear(page_middle);
		return;
	}
	page_table = pte_offset(page_middle, address);
	pte = *page_table;
	if (!pte_present(pte))
		return;
	flush_cache_page(vma, address);
	address &= ~PAGE_MASK;
	address += pte_page(pte);
	if (MAP_NR(address) >= max_mapnr)
		return;
	memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
	flush_page_to_ram(pte_page(pte));
}
/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page.  Ugly, but necessary.
 */
void vmtruncate(struct inode * inode, unsigned long offset)
{
	struct vm_area_struct * mpnt;

	truncate_inode_pages(inode, offset);
	spin_lock(&inode->i_shared_lock);
	if (!inode->i_mmap)
		goto out_unlock;
	mpnt = inode->i_mmap;
	do {
		struct mm_struct *mm = mpnt->vm_mm;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long len = end - start;
		unsigned long diff;

		/* mapping wholly truncated? */
		if (mpnt->vm_offset >= offset) {
			flush_cache_range(mm, start, end);
			zap_page_range(mm, start, len);
			flush_tlb_range(mm, start, end);
			continue;
		}
		/* mapping wholly unaffected? */
		diff = offset - mpnt->vm_offset;
		if (diff >= len)
			continue;
		/* Ok, partially affected.. */
		start += diff;
		len = (len - diff) & PAGE_MASK;
		if (start & ~PAGE_MASK) {
			partial_clear(mpnt, start);
			start = (start + ~PAGE_MASK) & PAGE_MASK;
		}
		flush_cache_range(mm, start, end);
		zap_page_range(mm, start, len);
		flush_tlb_range(mm, start, end);
	} while ((mpnt = mpnt->vm_next_share) != NULL);
out_unlock:
	spin_unlock(&inode->i_shared_lock);
}
/*
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 */
void swapin_readahead(unsigned long entry)
{
	int i;
	struct page *new_page;
	unsigned long offset = SWP_OFFSET(entry);
	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
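
	/*
	 * Round the cluster start down to a (1 << page_cluster) boundary:
	 * e.g. with page_cluster == 4, a fault on swap offset 0x123 starts
	 * the readahead at 0x120 and covers up to 16 entries.
	 */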
	offset = (offset >> page_cluster) << page_cluster;

	i = 1 << page_cluster;
	do {
		/* Don't read-ahead past the end of the swap area */
		if (offset >= swapdev->max)
			break;
		/* Don't block on I/O for read-ahead */
		if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
			break;
		/* Don't read in bad or busy pages */
		if (!swapdev->swap_map[offset])
			break;
		if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
			break;

		/* Ok, do the async read-ahead now */
		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
		if (new_page != NULL)
			__free_page(new_page);
		offset++;
	} while (--i);
	return;
}
static int do_swap_page(struct task_struct * tsk,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, unsigned long entry, int write_access)
{
	struct page *page = lookup_swap_cache(entry);
	pte_t pte;

	if (!page) {
		lock_kernel();
		swapin_readahead(entry);
		page = read_swap_cache(entry);
		unlock_kernel();
		if (!page)
			return -1;

		flush_page_to_ram(page_address(page));
	}

	vma->vm_mm->rss++;
	tsk->min_flt++;
	lock_kernel();
	swap_free(entry);
	unlock_kernel();

	pte = mk_pte(page_address(page), vma->vm_page_prot);

	if (write_access && !is_page_shared(page)) {
		delete_from_swap_cache(page);
		pte = pte_mkwrite(pte_mkdirty(pte));
	}
	set_pte(page_table, pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	return 1;
}
/*
 * Anonymous, not-present fault: a read fault maps the shared, write-protected
 * ZERO_PAGE; only a write fault allocates and clears a private page.
 * This only needs the MM semaphore.
 */
static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
	pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
	if (write_access) {
		unsigned long page = __get_free_page(GFP_USER);
		if (!page)
			return -1;
		clear_page(page);
		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
		vma->vm_mm->rss++;
		tsk->min_flt++;
		flush_page_to_ram(page);
	}
	set_pte(page_table, entry);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, entry);
	return 1;
}
/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * This is called with the MM semaphore and the kernel lock held.
 * We need to release the kernel lock as soon as possible..
 */
static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	unsigned long page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(tsk, vma, page_table, write_access, address);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible.  It's
	 * essentially an early COW detection.
	 */
	page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
	if (!page)
		return 0;	/* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */
	if (page == -1)
		return -1;	/* OOM */

	++tsk->maj_flt;
	++vma->vm_mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (page_count(mem_map+MAP_NR(page)) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		entry = pte_wrprotect(entry);
	set_pte(page_table, entry);
	/* no need to invalidate: a not-present page shouldn't be cached */
	update_mmu_cache(vma, address, entry);
	return 1;
}
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * Note the "page_table_lock". It is to protect against kswapd removing
 * pages from under us. Note that kswapd only ever _removes_ pages, never
 * adds them. As such, once we have noticed that the page is not present,
 * we can drop the lock early.
 *
 * The adding of pages is protected by the MM semaphore (which we hold),
 * so we don't need to worry about a page suddenly being added into
 * our VM.
 */
static inline int handle_pte_fault(struct task_struct *tsk,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	entry = *pte;
	if (!pte_present(entry)) {
		if (pte_none(entry))
			return do_no_page(tsk, vma, address, write_access, pte);
		return do_swap_page(tsk, vma, address, pte, pte_val(entry), write_access);
	}

	/*
	 * Ok, the entry was present, we need to get the page table
	 * lock to synchronize with kswapd, and verify that the entry
	 * didn't change from under us..
	 */
	spin_lock(&tsk->mm->page_table_lock);
	if (pte_val(entry) == pte_val(*pte)) {
		if (write_access) {
			if (!pte_write(entry))
				return do_wp_page(tsk, vma, address, pte, entry);

			entry = pte_mkdirty(entry);
		}
		entry = pte_mkyoung(entry);
		set_pte(pte, entry);
		flush_tlb_page(vma, address);
		update_mmu_cache(vma, address, entry);
	}
	spin_unlock(&tsk->mm->page_table_lock);
	return 1;
}
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * Returns > 0 if the fault was handled, 0 if it should raise SIGBUS,
 * and < 0 on out-of-memory.
 */
int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	pgd_t *pgd;
	pmd_t *pmd;

	pgd = pgd_offset(vma->vm_mm, address);
	pmd = pmd_alloc(pgd, address);
	if (pmd) {
		pte_t * pte = pte_alloc(pmd, address);
		if (pte)
			return handle_pte_fault(tsk, vma, address, write_access, pte);
	}
	return -1;
}
/*
 * Simplistic page force-in..
 */
int make_pages_present(unsigned long addr, unsigned long end)
{
	int write;
	struct task_struct *tsk = current;
	struct vm_area_struct * vma;

	vma = find_vma(tsk->mm, addr);
	write = (vma->vm_flags & VM_WRITE) != 0;
	while (addr < end) {
		if (handle_mm_fault(tsk, vma, addr, write) < 0)
			return -1;
		addr += PAGE_SIZE;
	}
	return 0;
}