/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code        <alan@redhat.com>
 */
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>

#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * WARNING: the debugging will use recursive algorithms so never enable this
 * unless you know what you are doing.
 */
#undef DEBUG_MM_RB

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type     prot
 *              PROT_NONE       PROT_READ       PROT_WRITE      PROT_EXEC
 * MAP_SHARED   r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
 *              w: (no) no      w: (no) no      w: (yes) yes    w: (no) no
 *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
 *
 * MAP_PRIVATE  r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
 *              w: (no) no      w: (no) no      w: (copy) copy  w: (no) no
 *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
 */
pgprot_t protection_map[16] = {
        __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
        __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
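
/*
 * For example, vma->vm_page_prot is later set from
 * protection_map[vm_flags & 0x0f], so the low VM_READ, VM_WRITE,
 * VM_EXEC and VM_SHARED bits pick the entry (illustration only,
 * the actual pgprot values are per-architecture):
 *
 *      MAP_PRIVATE, PROT_READ|PROT_WRITE  ->  __P011  (copy on write)
 *      MAP_SHARED,  PROT_READ|PROT_WRITE  ->  __S011  (shared writable)
 */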

int sysctl_overcommit_memory = 0;       /* default is heuristic overcommit */
int sysctl_overcommit_ratio = 50;       /* default is 50% */
atomic_t vm_committed_space = ATOMIC_INIT(0);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 1 means there is enough memory for the allocation to
 * succeed and 0 implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 */
extern atomic_t slab_reclaim_pages;
int vm_enough_memory(long pages)
{
        unsigned long free, allowed;

        vm_acct_memory(pages);

        /*
         * Sometimes we want to use more memory than we have
         */
        if (sysctl_overcommit_memory == 1)
                return 1;

        if (sysctl_overcommit_memory == 0) {
                free = get_page_cache_size();
                free += nr_free_pages();
                free += nr_swap_pages;

                /*
                 * Any slabs which are created with the
                 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
                 * which are reclaimable, under pressure.  The dentry
                 * cache and most inode caches should fall into this
                 * class.
                 */
                free += atomic_read(&slab_reclaim_pages);

                /*
                 * Leave the last 3% for root
                 */
                if (!capable(CAP_SYS_ADMIN))
                        free -= free / 32;

                if (free > pages)
                        return 1;
                vm_unacct_memory(pages);
                return 0;
        }

        allowed = totalram_pages * sysctl_overcommit_ratio / 100;
        allowed += total_swap_pages;

        if (atomic_read(&vm_committed_space) < allowed)
                return 1;

        vm_unacct_memory(pages);

        return 0;
}
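
/*
 * Rough worked example of the strict branch above (vm.overcommit_memory
 * set to a value other than 0 or 1): with 262144 pages of RAM (1GB of
 * 4K pages), the default overcommit_ratio of 50 and 131072 pages of
 * swap,
 *
 *      allowed = 262144 * 50 / 100 + 131072 = 262144 pages
 *
 * so new accountable mappings start failing once vm_committed_space
 * reaches roughly 1GB.  (Numbers are illustrative only.)
 */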

/*
 * Requires inode->i_mapping->i_shared_sem
 */
static inline void
__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
{
        if (inode) {
                if (vma->vm_flags & VM_DENYWRITE)
                        atomic_inc(&inode->i_writecount);
                list_del_init(&vma->shared);
        }
}

/*
 * Remove one vm structure from the inode's i_mapping address space.
 */
static void remove_shared_vm_struct(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;

        if (file) {
                struct inode *inode = file->f_dentry->d_inode;

                down(&inode->i_mapping->i_shared_sem);
                __remove_shared_vm_struct(vma, inode);
                up(&inode->i_mapping->i_shared_sem);
        }
}

/*
 *  sys_brk() for the most part doesn't need the global kernel
 *  lock, except when an application is doing something nasty
 *  like trying to un-brk an area that has already been mapped
 *  to a regular file.  in this case, the unmapping will need
 *  to invoke file system routines that need the global lock.
 */
asmlinkage unsigned long sys_brk(unsigned long brk)
{
        unsigned long rlim, retval;
        unsigned long newbrk, oldbrk;
        struct mm_struct *mm = current->mm;

        down_write(&mm->mmap_sem);

        if (brk < mm->end_code)
                goto out;
        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        if (oldbrk == newbrk)
                goto set_brk;

        /* Always allow shrinking brk. */
        if (brk <= mm->brk) {
                if (!do_munmap(mm, newbrk, oldbrk-newbrk))
                        goto set_brk;
                goto out;
        }

        /* Check against rlimit.. */
        rlim = current->rlim[RLIMIT_DATA].rlim_cur;
        if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
                goto out;

        /* Check against existing mmap mappings. */
        if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
                goto out;

        /* Ok, looks good - let it rip. */
        if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
                goto out;
set_brk:
        mm->brk = brk;
out:
        retval = mm->brk;
        up_write(&mm->mmap_sem);
        return retval;
}
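
/*
 * Example of the alignment above, assuming 4K pages: brk(0x0804a123)
 * gives newbrk = PAGE_ALIGN(0x0804a123) = 0x0804b000.  If the current
 * mm->brk rounds up to the same page, only mm->brk is updated and no
 * mapping changes; otherwise the difference is grown with do_brk() or
 * shrunk with do_munmap() as above.
 */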

/* Combine the mmap "prot" and "flags" arguments into one "vm_flags" used
 * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
 * into "VM_xxx".
 */
static inline unsigned long
calc_vm_flags(unsigned long prot, unsigned long flags)
{
#define _trans(x,bit1,bit2) \
((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)

        unsigned long prot_bits, flag_bits;
        prot_bits =
                _trans(prot, PROT_READ, VM_READ) |
                _trans(prot, PROT_WRITE, VM_WRITE) |
                _trans(prot, PROT_EXEC, VM_EXEC);
        flag_bits =
                _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
                _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
                _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
        return prot_bits | flag_bits;
#undef _trans
}
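
/*
 * For instance, where a PROT_* value happens to equal the corresponding
 * VM_* value, _trans() reduces to a plain mask; otherwise it tests the
 * bit and substitutes the VM_* value.  So, roughly,
 *
 *      calc_vm_flags(PROT_READ|PROT_WRITE, MAP_GROWSDOWN)
 *              == VM_READ | VM_WRITE | VM_GROWSDOWN
 *
 * (sketch only - the caller in do_mmap_pgoff() then ORs in
 * mm->def_flags and the VM_MAY* bits).
 */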

#ifdef DEBUG_MM_RB
static int browse_rb(struct rb_node * rb_node) {
        int i = 0;
        if (rb_node) {
                i++;
                i += browse_rb(rb_node->rb_left);
                i += browse_rb(rb_node->rb_right);
        }
        return i;
}

static void validate_mm(struct mm_struct * mm) {
        int bug = 0;
        int i = 0;
        struct vm_area_struct * tmp = mm->mmap;
        while (tmp) {
                tmp = tmp->vm_next;
                i++;
        }
        if (i != mm->map_count)
                printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
        i = browse_rb(mm->mm_rb.rb_node);
        if (i != mm->map_count)
                printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
        if (bug)
                BUG();
}
#else
#define validate_mm(mm) do { } while (0)
#endif

static struct vm_area_struct *
find_vma_prepare(struct mm_struct *mm, unsigned long addr,
                struct vm_area_struct **pprev, struct rb_node ***rb_link,
                struct rb_node ** rb_parent)
{
        struct vm_area_struct * vma;
        struct rb_node ** __rb_link, * __rb_parent, * rb_prev;

        __rb_link = &mm->mm_rb.rb_node;
        rb_prev = __rb_parent = NULL;
        vma = NULL;

        while (*__rb_link) {
                struct vm_area_struct *vma_tmp;

                __rb_parent = *__rb_link;
                vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);

                if (vma_tmp->vm_end > addr) {
                        vma = vma_tmp;
                        if (vma_tmp->vm_start <= addr)
                                return vma;
                        __rb_link = &__rb_parent->rb_left;
                } else {
                        rb_prev = __rb_parent;
                        __rb_link = &__rb_parent->rb_right;
                }
        }

        *pprev = NULL;
        if (rb_prev)
                *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
        *rb_link = __rb_link;
        *rb_parent = __rb_parent;
        return vma;
}

static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev, struct rb_node *rb_parent)
{
        if (prev) {
                vma->vm_next = prev->vm_next;
                prev->vm_next = vma;
        } else {
                mm->mmap = vma;
                if (rb_parent)
                        vma->vm_next = rb_entry(rb_parent,
                                        struct vm_area_struct, vm_rb);
                else
                        vma->vm_next = NULL;
        }
}

static void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
{
        rb_link_node(&vma->vm_rb, rb_parent, rb_link);
        rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}

static inline void __vma_link_file(struct vm_area_struct *vma)
{
        struct file * file;

        file = vma->vm_file;
        if (file) {
                struct inode * inode = file->f_dentry->d_inode;
                struct address_space *mapping = inode->i_mapping;

                if (vma->vm_flags & VM_DENYWRITE)
                        atomic_dec(&inode->i_writecount);

                if (vma->vm_flags & VM_SHARED)
                        list_add_tail(&vma->shared, &mapping->i_mmap_shared);
                else
                        list_add_tail(&vma->shared, &mapping->i_mmap);
        }
}

static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_area_struct *prev, struct rb_node **rb_link,
        struct rb_node *rb_parent)
{
        __vma_link_list(mm, vma, prev, rb_parent);
        __vma_link_rb(mm, vma, rb_link, rb_parent);
        __vma_link_file(vma);
}

static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev, struct rb_node **rb_link,
                struct rb_node *rb_parent)
{
        struct address_space *mapping = NULL;

        if (vma->vm_file)
                mapping = vma->vm_file->f_dentry->d_inode->i_mapping;

        if (mapping)
                down(&mapping->i_shared_sem);
        spin_lock(&mm->page_table_lock);
        __vma_link(mm, vma, prev, rb_link, rb_parent);
        spin_unlock(&mm->page_table_lock);
        if (mapping)
                up(&mapping->i_shared_sem);

        mark_mm_hugetlb(mm, vma);
        mm->map_count++;
        validate_mm(mm);
}

/*
 * If the vma has a ->close operation then the driver probably needs to release
 * per-vma resources, so we don't attempt to merge those.
 */
#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)

static inline int is_mergeable_vma(struct vm_area_struct *vma,
                        struct file *file, unsigned long vm_flags)
{
        if (vma->vm_ops && vma->vm_ops->close)
                return 0;
        if (vma->vm_file != file)
                return 0;
        if (vma->vm_flags != vm_flags)
                return 0;
        if (vma->vm_private_data)
                return 0;
        return 1;
}

/*
 * Return true if we can merge this (vm_flags,file,vm_pgoff,size)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We don't check here for the merged mmap wrapping around the end of pagecache
 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
 * wrap, nor mmaps which cover the final page at index -1UL.
 */
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
        struct file *file, unsigned long vm_pgoff, unsigned long size)
{
        if (is_mergeable_vma(vma, file, vm_flags)) {
                if (!file)
                        return 1;       /* anon mapping */
                if (vma->vm_pgoff == vm_pgoff + size)
                        return 1;
        }
        return 0;
}

/*
 * Return true if we can merge this (vm_flags,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 */
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
        struct file *file, unsigned long vm_pgoff)
{
        if (is_mergeable_vma(vma, file, vm_flags)) {
                unsigned long vma_size;

                if (!file)
                        return 1;       /* anon mapping */

                vma_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
                if (vma->vm_pgoff + vma_size == vm_pgoff)
                        return 1;
        }
        return 0;
}
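
/*
 * Example of the file-offset checks above: a vma mapping file pages
 * [4, 9) (vm_pgoff == 4, five pages long) can only have a new request
 * merged after it if the request's vm_pgoff is 9, and merged before it
 * if the request ends exactly at file page 4, so that pagecache
 * offsets stay linear across the combined vma.
 */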

/*
 * Given a new mapping request (addr,end,vm_flags,file,pgoff), figure out
 * whether that can be merged with its predecessor or its successor.  Or
 * both (it neatly fills a hole).
 */
static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
                        struct rb_node *rb_parent, unsigned long addr,
                        unsigned long end, unsigned long vm_flags,
                        struct file *file, unsigned long pgoff)
{
        spinlock_t * lock = &mm->page_table_lock;

        /*
         * We later require that vma->vm_flags == vm_flags, so this tests
         * vma->vm_flags & VM_SPECIAL, too.
         */
        if (vm_flags & VM_SPECIAL)
                return 0;

        if (!prev) {
                prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
                goto merge_next;
        }

        /*
         * Can it merge with the predecessor?
         */
        if (prev->vm_end == addr &&
                        is_mergeable_vma(prev, file, vm_flags) &&
                        can_vma_merge_after(prev, vm_flags, file, pgoff)) {
                struct vm_area_struct *next;
                struct inode *inode = file ? file->f_dentry->d_inode : NULL;
                int need_up = 0;

                if (unlikely(file && prev->vm_next &&
                                prev->vm_next->vm_file == file)) {
                        down(&inode->i_mapping->i_shared_sem);
                        need_up = 1;
                }
                spin_lock(lock);
                prev->vm_end = end;

                /*
                 * OK, it did.  Can we now merge in the successor as well?
                 */
                next = prev->vm_next;
                if (next && prev->vm_end == next->vm_start &&
                                can_vma_merge_before(next, vm_flags, file,
                                        pgoff, (end - addr) >> PAGE_SHIFT)) {
                        prev->vm_end = next->vm_end;
                        __vma_unlink(mm, next, prev);
                        __remove_shared_vm_struct(next, inode);
                        spin_unlock(lock);
                        if (need_up)
                                up(&inode->i_mapping->i_shared_sem);
                        if (file)
                                fput(file);

                        mm->map_count--;
                        kmem_cache_free(vm_area_cachep, next);
                        return 1;
                }
                spin_unlock(lock);
                if (need_up)
                        up(&inode->i_mapping->i_shared_sem);
                return 1;
        }

        /*
         * Can this new request be merged in front of prev->vm_next?
         */
        prev = prev->vm_next;
        if (prev) {
 merge_next:
                if (!can_vma_merge_before(prev, vm_flags, file,
                                pgoff, (end - addr) >> PAGE_SHIFT))
                        return 0;
                if (end == prev->vm_start) {
                        spin_lock(lock);
                        prev->vm_start = addr;
                        prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
                        spin_unlock(lock);
                        return 1;
                }
        }

        return 0;
}

/*
 * The caller must hold down_write(current->mm->mmap_sem).
 */

unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, unsigned long pgoff)
{
        struct mm_struct * mm = current->mm;
        struct vm_area_struct * vma, * prev;
        struct inode *inode;
        unsigned int vm_flags;
        int correct_wcount = 0;
        int error;
        struct rb_node ** rb_link, * rb_parent;
        unsigned long charged = 0;

        if (file && (!file->f_op || !file->f_op->mmap))
                return -ENODEV;

        if (!len)
                return addr;

        if (len > TASK_SIZE)
                return -EINVAL;

        len = PAGE_ALIGN(len);

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EINVAL;

        /* Too many mappings? */
        if (mm->map_count > MAX_MAP_COUNT)
                return -ENOMEM;

        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
        addr = get_unmapped_area(file, addr, len, pgoff, flags);
        if (addr & ~PAGE_MASK)
                return addr;

        /* Do simple checking here so the lower-level routines won't have
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        vm_flags = calc_vm_flags(prot,flags) | mm->def_flags |
                        VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        if (flags & MAP_LOCKED) {
                if (!capable(CAP_IPC_LOCK))
                        return -EPERM;
                vm_flags |= VM_LOCKED;
        }
        /* mlock MCL_FUTURE? */
        if (vm_flags & VM_LOCKED) {
                unsigned long locked = mm->locked_vm << PAGE_SHIFT;
                locked += len;
                if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
                        return -EAGAIN;
        }

        inode = file ? file->f_dentry->d_inode : NULL;

        if (file) {
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
                                return -EACCES;

                        /*
                         * Make sure we don't allow writing to an append-only
                         * file..
                         */
                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        /*
                         * Make sure there are no mandatory locks on the file.
                         */
                        if (locks_verify_locked(inode))
                                return -EAGAIN;

                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

                        /* fall through */
                case MAP_PRIVATE:
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
                        break;

                default:
                        return -EINVAL;
                }
        } else {
                vm_flags |= VM_SHARED | VM_MAYSHARE;
                switch (flags & MAP_TYPE) {
                default:
                        return -EINVAL;
                case MAP_PRIVATE:
                        vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
                        /* fall through */
                case MAP_SHARED:
                        break;
                }
        }

        error = security_file_mmap(file, prot, flags);
        if (error)
                return error;

        /* Clear old maps */
        error = -ENOMEM;
munmap_back:
        vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
        if (vma && vma->vm_start < addr + len) {
                if (do_munmap(mm, addr, len))
                        return -ENOMEM;
                goto munmap_back;
        }

        /* Check against address space limit. */
        if ((mm->total_vm << PAGE_SHIFT) + len
            > current->rlim[RLIMIT_AS].rlim_cur)
                return -ENOMEM;

        if (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory > 1) {
                if (vm_flags & VM_SHARED) {
                        /* Check memory availability in shmem_file_setup? */
                        vm_flags |= VM_ACCOUNT;
                } else if (vm_flags & VM_WRITE) {
                        /*
                         * Private writable mapping: check memory availability
                         */
                        charged = len >> PAGE_SHIFT;
                        if (!vm_enough_memory(charged))
                                return -ENOMEM;
                        vm_flags |= VM_ACCOUNT;
                }
        }

        /* Can we just expand an old anonymous mapping? */
        if (!file && !(vm_flags & VM_SHARED) && rb_parent)
                if (vma_merge(mm, prev, rb_parent, addr, addr + len,
                                        vm_flags, NULL, 0))
                        goto out;

        /*
         * Determine the object being mapped and call the appropriate
         * specific mapper. the address has already been validated, but
         * not unmapped, but the maps are removed from the list.
         */
        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        error = -ENOMEM;
        if (!vma)
                goto unacct_error;

        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_flags = vm_flags;
        vma->vm_page_prot = protection_map[vm_flags & 0x0f];
        vma->vm_ops = NULL;
        vma->vm_pgoff = pgoff;
        vma->vm_file = NULL;
        vma->vm_private_data = NULL;
        vma->vm_next = NULL;
        INIT_LIST_HEAD(&vma->shared);

        if (file) {
                error = -EINVAL;
                if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                        goto free_vma;
                if (vm_flags & VM_DENYWRITE) {
                        error = deny_write_access(file);
                        if (error)
                                goto free_vma;
                        correct_wcount = 1;
                }
                vma->vm_file = file;
                get_file(file);
                error = file->f_op->mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;
        } else if (vm_flags & VM_SHARED) {
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        }

        /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
         * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
         * that memory reservation must be checked; but that reservation
         * belongs to shared memory object, not to vma: so now clear it.
         */
        if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
                vma->vm_flags &= ~VM_ACCOUNT;

        /* Can addr have changed??
         *
         * Answer: Yes, several device drivers can do it in their
         *         f_op->mmap method. -DaveM
         */
        addr = vma->vm_start;

        if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr,
                                addr + len, vma->vm_flags, file, pgoff)) {
                vma_link(mm, vma, prev, rb_link, rb_parent);
                if (correct_wcount)
                        atomic_inc(&inode->i_writecount);
        } else {
                if (file) {
                        if (correct_wcount)
                                atomic_inc(&inode->i_writecount);
                        fput(file);
                }
                kmem_cache_free(vm_area_cachep, vma);
        }
out:
        mm->total_vm += len >> PAGE_SHIFT;
        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += len >> PAGE_SHIFT;
                make_pages_present(addr, addr + len);
        }
        if (flags & MAP_POPULATE) {
                up_write(&mm->mmap_sem);
                sys_remap_file_pages(addr, len, prot,
                                        pgoff, flags & MAP_NONBLOCK);
                down_write(&mm->mmap_sem);
        }
        return addr;

unmap_and_free_vma:
        if (correct_wcount)
                atomic_inc(&inode->i_writecount);
        vma->vm_file = NULL;
        fput(file);

        /* Undo any partial mapping done by a device driver. */
        zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
free_vma:
        kmem_cache_free(vm_area_cachep, vma);
unacct_error:
        if (charged)
                vm_unacct_memory(charged);
        return error;
}

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *      if (ret & ~PAGE_MASK)
 *              error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
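
/*
 * (-ENOMEM is -12, i.e. 0xfffffff4 as a 32-bit unsigned long, so its
 * page-offset bits (ret & ~PAGE_MASK) are non-zero and the check above
 * does treat it as an error value rather than as an address.)
 */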

#ifndef HAVE_ARCH_UNMAPPED_AREA
static inline unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > TASK_SIZE)
                return -ENOMEM;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }
        start_addr = addr = mm->free_area_cache;

full_search:
        for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
                /* At this point:  (!vma || addr < vma->vm_end). */
                if (TASK_SIZE - len < addr) {
                        /*
                         * Start a new search - just in case we missed
                         * some holes.
                         */
                        if (start_addr != TASK_UNMAPPED_BASE) {
                                start_addr = addr = TASK_UNMAPPED_BASE;
                                goto full_search;
                        }
                        return -ENOMEM;
                }
                if (!vma || addr + len <= vma->vm_start) {
                        /*
                         * Remember the place where we stopped the search:
                         */
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                addr = vma->vm_end;
        }
}
#else
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
                        unsigned long, unsigned long);
#endif

unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        if (flags & MAP_FIXED) {
                unsigned long ret;

                if (addr > TASK_SIZE - len)
                        return -ENOMEM;
                if (addr & ~PAGE_MASK)
                        return -EINVAL;
                if (file && is_file_hugepages(file)) {
                        /*
                         * Make sure that addr and length are properly aligned.
                         */
                        ret = is_aligned_hugepage_range(addr, len);
                } else {
                        /*
                         * Ensure that a normal request is not falling in a
                         * reserved hugepage range.  For some archs like IA-64,
                         * there is a separate region for hugepages.
                         */
                        ret = is_hugepage_only_range(addr, len);
                }
                if (ret)
                        return -EINVAL;
                return addr;
        }

        if (file && file->f_op && file->f_op->get_unmapped_area)
                return file->f_op->get_unmapped_area(file, addr, len,
                                                pgoff, flags);

        return arch_get_unmapped_area(file, addr, len, pgoff, flags);
}

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
        struct vm_area_struct *vma = NULL;

        if (mm) {
                /* Check the cache first. */
                /* (Cache hit rate is typically around 35%.) */
                vma = mm->mmap_cache;
                if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
                        struct rb_node * rb_node;

                        rb_node = mm->mm_rb.rb_node;
                        vma = NULL;

                        while (rb_node) {
                                struct vm_area_struct * vma_tmp;

                                vma_tmp = rb_entry(rb_node,
                                                struct vm_area_struct, vm_rb);

                                if (vma_tmp->vm_end > addr) {
                                        vma = vma_tmp;
                                        if (vma_tmp->vm_start <= addr)
                                                break;
                                        rb_node = rb_node->rb_left;
                                } else
                                        rb_node = rb_node->rb_right;
                        }
                        if (vma)
                                mm->mmap_cache = vma;
                }
        }
        return vma;
}

/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
                        struct vm_area_struct **pprev)
{
        struct vm_area_struct *vma = NULL, *prev = NULL;
        struct rb_node * rb_node;
        if (!mm)
                goto out;

        /* Guard against addr being lower than the first VMA */
        vma = mm->mmap;

        /* Go through the RB tree quickly. */
        rb_node = mm->mm_rb.rb_node;

        while (rb_node) {
                struct vm_area_struct *vma_tmp;
                vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

                if (addr < vma_tmp->vm_end) {
                        rb_node = rb_node->rb_left;
                } else {
                        prev = vma_tmp;
                        if (!prev->vm_next || (addr < prev->vm_next->vm_end))
                                break;
                        rb_node = rb_node->rb_right;
                }
        }

out:
        *pprev = prev;
        return prev ? prev->vm_next : vma;
}

#ifdef CONFIG_STACK_GROWSUP
/*
 * vma is the first one with address > vma->vm_end.  Have to extend vma.
 */
int expand_stack(struct vm_area_struct * vma, unsigned long address)
{
        unsigned long grow;

        if (!(vma->vm_flags & VM_GROWSUP))
                return -EFAULT;

        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_sem in read mode. We need to get
         * the spinlock only before relocating the vma range ourself.
         */
        address += 4 + PAGE_SIZE - 1;
        address &= PAGE_MASK;
        spin_lock(&vma->vm_mm->page_table_lock);
        grow = (address - vma->vm_end) >> PAGE_SHIFT;

        /* Overcommit.. */
        if (!vm_enough_memory(grow)) {
                spin_unlock(&vma->vm_mm->page_table_lock);
                return -ENOMEM;
        }

        if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur ||
                        ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
                        current->rlim[RLIMIT_AS].rlim_cur) {
                spin_unlock(&vma->vm_mm->page_table_lock);
                vm_unacct_memory(grow);
                return -ENOMEM;
        }
        vma->vm_end = address;
        vma->vm_mm->total_vm += grow;
        if (vma->vm_flags & VM_LOCKED)
                vma->vm_mm->locked_vm += grow;
        spin_unlock(&vma->vm_mm->page_table_lock);
        return 0;
}

struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma, *prev;

        addr &= PAGE_MASK;
        vma = find_vma_prev(mm, addr, &prev);
        if (vma && (vma->vm_start <= addr))
                return vma;
        if (!prev || expand_stack(prev, addr))
                return NULL;
        if (prev->vm_flags & VM_LOCKED) {
                make_pages_present(addr, prev->vm_end);
        }
        return prev;
}
#else
/*
 * vma is the first one with address < vma->vm_start.  Have to extend vma.
 */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
        unsigned long grow;

        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_sem in read mode. We need to get
         * the spinlock only before relocating the vma range ourself.
         */
        address &= PAGE_MASK;
        spin_lock(&vma->vm_mm->page_table_lock);
        grow = (vma->vm_start - address) >> PAGE_SHIFT;

        /* Overcommit.. */
        if (!vm_enough_memory(grow)) {
                spin_unlock(&vma->vm_mm->page_table_lock);
                return -ENOMEM;
        }

        if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
                        ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
                        current->rlim[RLIMIT_AS].rlim_cur) {
                spin_unlock(&vma->vm_mm->page_table_lock);
                vm_unacct_memory(grow);
                return -ENOMEM;
        }
        vma->vm_start = address;
        vma->vm_pgoff -= grow;
        vma->vm_mm->total_vm += grow;
        if (vma->vm_flags & VM_LOCKED)
                vma->vm_mm->locked_vm += grow;
        spin_unlock(&vma->vm_mm->page_table_lock);
        return 0;
}

struct vm_area_struct *
find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
        struct vm_area_struct * vma;
        unsigned long start;

        addr &= PAGE_MASK;
        vma = find_vma(mm,addr);
        if (!vma)
                return NULL;
        if (vma->vm_start <= addr)
                return vma;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                return NULL;
        start = vma->vm_start;
        if (expand_stack(vma, addr))
                return NULL;
        if (vma->vm_flags & VM_LOCKED) {
                make_pages_present(addr, start);
        }
        return vma;
}
#endif

/*
 * Try to free as many page directory entries as we can,
 * without having to work very hard at actually scanning
 * the page tables themselves.
 *
 * Right now we try to free page tables if we have a nice
 * PGDIR-aligned area that got free'd up. We could be more
 * granular if we want to, but this is fast and simple,
 * and covers the bad cases.
 *
 * "prev", if it exists, points to a vma before the one
 * we just free'd - but there's no telling how much before.
 */
static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
        unsigned long start, unsigned long end)
{
        unsigned long first = start & PGDIR_MASK;
        unsigned long last = end + PGDIR_SIZE - 1;
        unsigned long start_index, end_index;
        struct mm_struct *mm = tlb->mm;

        if (!prev) {
                prev = mm->mmap;
                if (!prev)
                        goto no_mmaps;
                if (prev->vm_end > start) {
                        if (last > prev->vm_start)
                                last = prev->vm_start;
                        goto no_mmaps;
                }
        }
        for (;;) {
                struct vm_area_struct *next = prev->vm_next;

                if (next) {
                        if (next->vm_start < start) {
                                prev = next;
                                continue;
                        }
                        if (last > next->vm_start)
                                last = next->vm_start;
                }
                if (prev->vm_end > first)
                        first = prev->vm_end + PGDIR_SIZE - 1;
                break;
        }
no_mmaps:
        if (last < first)       /* for arches with discontiguous pgd indices */
                return;
        /*
         * If the PGD bits are not consecutive in the virtual address, the
         * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
         */
        start_index = pgd_index(first);
        if (start_index < FIRST_USER_PGD_NR)
                start_index = FIRST_USER_PGD_NR;
        end_index = pgd_index(last);
        if (end_index > start_index) {
                clear_page_tables(tlb, start_index, end_index - start_index);
                flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
        }
}

/* Normal function to fix up a mapping
 * This function is the default for when an area has no specific
 * function.  This may be used as part of a more specific routine.
 *
 * By the time this function is called, the area struct has been
 * removed from the process mapping list.
 */
static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
{
        size_t len = area->vm_end - area->vm_start;

        area->vm_mm->total_vm -= len >> PAGE_SHIFT;
        if (area->vm_flags & VM_LOCKED)
                area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
        /*
         * Is this a new hole at the lowest possible address?
         */
        if (area->vm_start >= TASK_UNMAPPED_BASE &&
                        area->vm_start < area->vm_mm->free_area_cache)
                area->vm_mm->free_area_cache = area->vm_start;

        remove_shared_vm_struct(area);

        if (area->vm_ops && area->vm_ops->close)
                area->vm_ops->close(area);
        if (area->vm_file)
                fput(area->vm_file);
        kmem_cache_free(vm_area_cachep, area);
}

/*
 * Update the VMA and inode share lists.
 *
 * Ok - we have the memory areas we should free on the 'free' list,
 * so release them, and do the vma updates.
 */
static void unmap_vma_list(struct mm_struct *mm,
        struct vm_area_struct *mpnt)
{
        do {
                struct vm_area_struct *next = mpnt->vm_next;
                unmap_vma(mm, mpnt);
                mpnt = next;
        } while (mpnt != NULL);
        validate_mm(mm);
}

/*
 * Get rid of page table information in the indicated region.
 *
 * Called with the page table lock held.
 */
static void unmap_region(struct mm_struct *mm,
        struct vm_area_struct *vma,
        struct vm_area_struct *prev,
        unsigned long start,
        unsigned long end)
{
        struct mmu_gather *tlb;
        unsigned long nr_accounted = 0;

        lru_add_drain();
        tlb = tlb_gather_mmu(mm, 0);
        unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
        vm_unacct_memory(nr_accounted);
        free_pgtables(tlb, prev, start, end);
        tlb_finish_mmu(tlb, start, end);
}

/*
 * Create a list of vma's touched by the unmap, removing them from the mm's
 * vma list as we go..
 *
 * Called with the page_table_lock held.
 */
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_area_struct *prev, unsigned long end)
{
        struct vm_area_struct **insertion_point;
        struct vm_area_struct *tail_vma = NULL;

        insertion_point = (prev ? &prev->vm_next : &mm->mmap);
        do {
                rb_erase(&vma->vm_rb, &mm->mm_rb);
                mm->map_count--;
                tail_vma = vma;
                vma = vma->vm_next;
        } while (vma && vma->vm_start < end);
        *insertion_point = vma;
        tail_vma->vm_next = NULL;
        mm->mmap_cache = NULL;          /* Kill the cache. */
}

/*
 * Split a vma into two pieces at address 'addr', a new vma is allocated
 * either for the first part or the tail.
 */
int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
              unsigned long addr, int new_below)
{
        struct vm_area_struct *new;

        if (mm->map_count >= MAX_MAP_COUNT)
                return -ENOMEM;

        new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!new)
                return -ENOMEM;

        /* most fields are the same, copy all, and then fixup */
        *new = *vma;

        INIT_LIST_HEAD(&new->shared);

        if (new_below) {
                new->vm_end = addr;
                vma->vm_start = addr;
                vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
        } else {
                vma->vm_end = addr;
                new->vm_start = addr;
                new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
        }

        if (new->vm_file)
                get_file(new->vm_file);

        if (new->vm_ops && new->vm_ops->open)
                new->vm_ops->open(new);

        insert_vm_struct(mm, new);
        return 0;
}
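
/*
 * Example of the vm_pgoff fixup above, assuming 4K pages: splitting a
 * file-backed vma [0x10000000, 0x10008000) with vm_pgoff 0 at
 * addr 0x10003000 with new_below == 0 leaves the old vma covering
 * [0x10000000, 0x10003000) and gives the new (tail) vma
 * vm_pgoff = 0 + ((0x10003000 - 0x10000000) >> PAGE_SHIFT) = 3,
 * so both halves keep mapping the same file offsets as before.
 */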

/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@goop.org>
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
        unsigned long end;
        struct vm_area_struct *mpnt, *prev, *last;

        if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
                return -EINVAL;

        if ((len = PAGE_ALIGN(len)) == 0)
                return -EINVAL;

        /* Find the first overlapping VMA */
        mpnt = find_vma_prev(mm, start, &prev);
        if (!mpnt)
                return 0;
        /* we have  start < mpnt->vm_end  */

        if (is_vm_hugetlb_page(mpnt)) {
                int ret = is_aligned_hugepage_range(start, len);

                if (ret)
                        return ret;
        }

        /* if it doesn't overlap, we have nothing.. */
        end = start + len;
        if (mpnt->vm_start >= end)
                return 0;

        /* Something will probably happen, so notify. */
        if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC))
                profile_exec_unmap(mm);

        /*
         * If we need to split any vma, do it now to save pain later.
         *
         * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
         * unmapped vm_area_struct will remain in use: so lower split_vma
         * places tmp vma above, and higher split_vma places tmp vma below.
         */
        if (start > mpnt->vm_start) {
                if (split_vma(mm, mpnt, start, 0))
                        return -ENOMEM;
                prev = mpnt;
        }

        /* Does it split the last one? */
        last = find_vma(mm, end);
        if (last && end > last->vm_start) {
                if (split_vma(mm, last, end, 1))
                        return -ENOMEM;
        }
        mpnt = prev? prev->vm_next: mm->mmap;

        /*
         * Remove the vma's, and unmap the actual pages
         */
        spin_lock(&mm->page_table_lock);
        detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
        unmap_region(mm, mpnt, prev, start, end);
        spin_unlock(&mm->page_table_lock);

        /* Fix up all other VM information */
        unmap_vma_list(mm, mpnt);

        return 0;
}

asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
        int ret;
        struct mm_struct *mm = current->mm;

        down_write(&mm->mmap_sem);
        ret = do_munmap(mm, addr, len);
        up_write(&mm->mmap_sem);
        return ret;
}

/*
 *  this is really a simplified "do_mmap".  it only handles
 *  anonymous maps.  eventually we may be able to do some
 *  brk-specific accounting here.
 */
unsigned long do_brk(unsigned long addr, unsigned long len)
{
        struct mm_struct * mm = current->mm;
        struct vm_area_struct * vma, * prev;
        unsigned long flags;
        struct rb_node ** rb_link, * rb_parent;

        len = PAGE_ALIGN(len);
        if (!len)
                return addr;

        /*
         * mlock MCL_FUTURE?
         */
        if (mm->def_flags & VM_LOCKED) {
                unsigned long locked = mm->locked_vm << PAGE_SHIFT;
                locked += len;
                if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
                        return -EAGAIN;
        }

        /*
         * Clear old maps.  this also does some error checking for us
         */
munmap_back:
        vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
        if (vma && vma->vm_start < addr + len) {
                if (do_munmap(mm, addr, len))
                        return -ENOMEM;
                goto munmap_back;
        }

        /* Check against address space limits *after* clearing old maps... */
        if ((mm->total_vm << PAGE_SHIFT) + len
            > current->rlim[RLIMIT_AS].rlim_cur)
                return -ENOMEM;

        if (mm->map_count > MAX_MAP_COUNT)
                return -ENOMEM;

        if (!vm_enough_memory(len >> PAGE_SHIFT))
                return -ENOMEM;

        flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

        /* Can we just expand an old anonymous mapping? */
        if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len,
                                        flags, NULL, 0))
                goto out;

        /*
         * create a vma struct for an anonymous mapping
         */
        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!vma) {
                vm_unacct_memory(len >> PAGE_SHIFT);
                return -ENOMEM;
        }

        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_flags = flags;
        vma->vm_page_prot = protection_map[flags & 0x0f];
        vma->vm_ops = NULL;
        vma->vm_pgoff = 0;
        vma->vm_file = NULL;
        vma->vm_private_data = NULL;
        INIT_LIST_HEAD(&vma->shared);

        vma_link(mm, vma, prev, rb_link, rb_parent);

out:
        mm->total_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED) {
                mm->locked_vm += len >> PAGE_SHIFT;
                make_pages_present(addr, addr + len);
        }
        return addr;
}

/* Build the RB tree corresponding to the VMA list. */
void build_mmap_rb(struct mm_struct * mm)
{
        struct vm_area_struct * vma;
        struct rb_node ** rb_link, * rb_parent;

        mm->mm_rb = RB_ROOT;
        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                __vma_link_rb(mm, vma, rb_link, rb_parent);
                rb_parent = &vma->vm_rb;
                rb_link = &rb_parent->rb_right;
        }
}

/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather *tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;

        profile_exit_mmap(mm);

        lru_add_drain();

        spin_lock(&mm->page_table_lock);

        tlb = tlb_gather_mmu(mm, 1);
        flush_cache_mm(mm);
        /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
        mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
                                        ~0UL, &nr_accounted);
        vm_unacct_memory(nr_accounted);
        BUG_ON(mm->map_count);  /* This is just debugging */
        clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
        tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));

        vma = mm->mmap;
        mm->mmap = mm->mmap_cache = NULL;
        mm->mm_rb = RB_ROOT;
        mm->rss = 0;
        mm->total_vm = 0;
        mm->locked_vm = 0;

        spin_unlock(&mm->page_table_lock);

        /*
         * Walk the list again, actually closing and freeing it
         * without holding any MM locks.
         */
        while (vma) {
                struct vm_area_struct *next = vma->vm_next;
                remove_shared_vm_struct(vma);
                if (vma->vm_ops) {
                        if (vma->vm_ops->close)
                                vma->vm_ops->close(vma);
                }
                if (vma->vm_file)
                        fput(vma->vm_file);
                kmem_cache_free(vm_area_cachep, vma);
                vma = next;
        }
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap ring.  If vm_file is non-NULL
 * then i_shared_sem is taken here.
 */
void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
        struct vm_area_struct * __vma, * prev;
        struct rb_node ** rb_link, * rb_parent;

        __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
        if (__vma && __vma->vm_start < vma->vm_end)
                BUG();
        vma_link(mm, vma, prev, rb_link, rb_parent);
        validate_mm(mm);
}