/*
 *	linux/mm/mmap.c
 *
 * Written by obz.
 */
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/file.h>

#include <asm/uaccess.h>
#include <asm/pgalloc.h>
/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 */
pgprot_t protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
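/*
 * Illustrative note (not in the original source): vm_page_prot is derived
 * later in this file as protection_map[vma->vm_flags & 0x0f], so the low
 * four VM_* bits select an entry.  Assuming the usual bit values
 * VM_READ == 0x1, VM_WRITE == 0x2, VM_EXEC == 0x4 and VM_SHARED == 0x8,
 * a MAP_PRIVATE PROT_READ|PROT_WRITE mapping indexes entry 0x3 and gets
 * __P011 (readable, copy-on-write), while the MAP_SHARED variant indexes
 * 0xb and gets __S011 (readable and genuinely writable).
 */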
/* SLAB cache for vm_area_struct's. */
kmem_cache_t *vm_area_cachep;

int sysctl_overcommit_memory;
/* Check that a process has enough memory to allocate a
 * new virtual mapping.
 */
int vm_enough_memory(long pages)
{
	/* Stupid algorithm to decide if we have enough memory: while
	 * simple, it hopefully works in most obvious cases.. Easy to
	 * fool it, but this should catch most mistakes.
	 */
	/* 23/11/98 NJC: Somewhat less stupid version of algorithm,
	 * which tries to do "TheRightThing".  Instead of using half of
	 * (buffers+cache), use the minimum values.  Allow an extra 2%
	 * of num_physpages for safety margin.
	 */
	long free;

	/* Sometimes we want to use more memory than we have. */
	if (sysctl_overcommit_memory)
		return 1;

	free = atomic_read(&buffermem_pages);
	free += atomic_read(&page_cache_size);
	free += nr_free_pages();
	free += nr_swap_pages;
	return free > pages;
}
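/*
 * Usage note (illustrative, not in the original source): callers pass the
 * request in pages, e.g. vm_enough_memory(len >> PAGE_SHIFT) for a private
 * writable mapping of len bytes.  With 4 KB pages a 16 MB request is 4096
 * pages, and it is granted only if buffer, page-cache, free and swap pages
 * together exceed that count -- unless sysctl_overcommit_memory is set, in
 * which case everything is granted.
 */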
/* Remove one vm structure from the inode's i_mmap ring. */
static inline void remove_shared_vm_struct(struct vm_area_struct *vma)
{
	struct file * file = vma->vm_file;

	if (file) {
		if (vma->vm_flags & VM_DENYWRITE)
			atomic_inc(&file->f_dentry->d_inode->i_writecount);
		spin_lock(&file->f_dentry->d_inode->i_shared_lock);
		if (vma->vm_next_share)
			vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share;
		*vma->vm_pprev_share = vma->vm_next_share;
		spin_unlock(&file->f_dentry->d_inode->i_shared_lock);
	}
}
/*
 *  sys_brk() for the most part doesn't need the global kernel
 *  lock, except when an application is doing something nasty
 *  like trying to un-brk an area that has already been mapped
 *  to a regular file.  in this case, the unmapping will need
 *  to invoke file system routines that need the global lock.
 */
asmlinkage unsigned long sys_brk(unsigned long brk)
{
	unsigned long rlim, retval;
	unsigned long newbrk, oldbrk;
	struct mm_struct *mm = current->mm;

	down(&mm->mmap_sem);

	if (brk < mm->end_code)
		goto out;
	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk)
		goto set_brk;

	/* Always allow shrinking brk. */
	if (brk <= mm->brk) {
		if (!do_munmap(newbrk, oldbrk-newbrk))
			goto set_brk;
		goto out;
	}

	/* Check against rlimit and stack.. */
	rlim = current->rlim[RLIMIT_DATA].rlim_cur;
	if (rlim < RLIM_INFINITY && brk - mm->end_code > rlim)
		goto out;

	/* Check against existing mmap mappings. */
	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
		goto out;

	/* Check if we have enough memory.. */
	if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT))
		goto out;

	/* Ok, looks good - let it rip. */
	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
		goto out;
set_brk:
	mm->brk = brk;
out:
	retval = mm->brk;
	up(&mm->mmap_sem);
	return retval;
}
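/*
 * Worked example (illustrative, not in the original source): if mm->brk is
 * 0x08050004 and userspace asks for brk = 0x08052000, then with 4 KB pages
 * oldbrk = PAGE_ALIGN(0x08050004) = 0x08051000 and newbrk = 0x08052000, so
 * do_brk(0x08051000, 0x1000) extends the heap by one page, provided the
 * rlimit, overlap and memory checks above all pass.  The unaligned value
 * 0x08052000 is what gets stored in mm->brk and returned to userspace.
 */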
/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used
 * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
 * into "VM_xxx".
 */
static inline unsigned long vm_flags(unsigned long prot, unsigned long flags)
{
#define _trans(x,bit1,bit2) \
((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)

	unsigned long prot_bits, flag_bits;
	prot_bits =
		_trans(prot, PROT_READ, VM_READ) |
		_trans(prot, PROT_WRITE, VM_WRITE) |
		_trans(prot, PROT_EXEC, VM_EXEC);
	flag_bits =
		_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
		_trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
		_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
	return prot_bits | flag_bits;
#undef _trans
}
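/*
 * Illustrative note (not in the original source): _trans(x,bit1,bit2) is a
 * compile-time shortcut.  When the PROT_/MAP_ bit happens to equal the
 * corresponding VM_ bit (as it typically does on i386, e.g. PROT_READ and
 * VM_READ are both 0x1), it collapses to a plain mask (x & bit1); otherwise
 * it tests bit1 and substitutes bit2.  Either way, vm_flags(PROT_READ |
 * PROT_WRITE, 0) yields VM_READ | VM_WRITE.
 */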
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
	unsigned long prot, unsigned long flags, unsigned long pgoff)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma;
	int error;

	if (file && (!file->f_op || !file->f_op->mmap))
		return -ENODEV;

	if ((len = PAGE_ALIGN(len)) == 0)
		return addr;

	if (len > TASK_SIZE || addr > TASK_SIZE-len)
		return -EINVAL;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EINVAL;

	/* Too many mappings? */
	if (mm->map_count > MAX_MAP_COUNT)
		return -ENOMEM;

	/* mlock MCL_FUTURE? */
	if (mm->def_flags & VM_LOCKED) {
		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
		locked += len;
		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
			return -EAGAIN;
	}

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	if (file != NULL) {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if ((prot & PROT_WRITE) && !(file->f_mode & 2))
				return -EACCES;

			/* Make sure we don't allow writing to an append-only file.. */
			if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & 2))
				return -EACCES;

			/* make sure there are no mandatory locks on the file. */
			if (locks_verify_locked(file->f_dentry->d_inode))
				return -EAGAIN;

			/* fall through */
		case MAP_PRIVATE:
			if (!(file->f_mode & 1))
				return -EACCES;
			break;

		default:
			return -EINVAL;
		}
	} else if ((flags & MAP_TYPE) != MAP_PRIVATE)
		return -EINVAL;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	if (flags & MAP_FIXED) {
		if (addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		addr = get_unmapped_area(addr, len);
		if (!addr)
			return -ENOMEM;
	}

	/* Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma)
		return -ENOMEM;

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;

	if (file) {
		if (file->f_mode & 1)
			vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
		if (flags & MAP_SHARED) {
			vma->vm_flags |= VM_SHARED | VM_MAYSHARE;

			/* This looks strange, but when we don't have the file open
			 * for writing, we can demote the shared mapping to a simpler
			 * private mapping. That also takes care of a security hole
			 * with ptrace() writing to a shared mapping without write
			 * permissions.
			 *
			 * We leave the VM_MAYSHARE bit on, just to get correct output
			 * from /proc/xxx/maps..
			 */
			if (!(file->f_mode & 2))
				vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
		}
	} else
		vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
	vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
	vma->vm_ops = NULL;
	vma->vm_pgoff = pgoff;
	vma->vm_file = NULL;
	vma->vm_private_data = NULL;

	/* Clear old maps */
	error = -ENOMEM;
	if (do_munmap(addr, len))
		goto free_vma;

	/* Check against address space limit. */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		goto free_vma;

	/* Private writable mapping? Check memory availability.. */
	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
	    !(flags & MAP_NORESERVE) &&
	    !vm_enough_memory(len >> PAGE_SHIFT))
		goto free_vma;

	if (file) {
		int correct_wcount = 0;
		if (vma->vm_flags & VM_DENYWRITE) {
			if (atomic_read(&file->f_dentry->d_inode->i_writecount) > 0) {
				error = -ETXTBSY;
				goto free_vma;
			}

			/* f_op->mmap might possibly sleep
			 * (generic_file_mmap doesn't, but other code
			 * might). In any case, this takes care of any
			 * race that this might cause.
			 */
			atomic_dec(&file->f_dentry->d_inode->i_writecount);
			correct_wcount = 1;
		}
		error = file->f_op->mmap(file, vma);
		/* Fix up the count if necessary, then check for an error */
		if (correct_wcount)
			atomic_inc(&file->f_dentry->d_inode->i_writecount);
		if (error)
			goto unmap_and_free_vma;
		vma->vm_file = file;
		get_file(file);
	}

	/*
	 * merge_segments may merge our vma, so we can't refer to it
	 * after the call.  Save the values we need now ...
	 */
	flags = vma->vm_flags;
	addr = vma->vm_start;		/* can addr have changed?? */
	vmlist_modify_lock(mm);
	insert_vm_struct(mm, vma);
	merge_segments(mm, vma->vm_start, vma->vm_end);
	vmlist_modify_unlock(mm);

	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
	return addr;

unmap_and_free_vma:
	/* Undo any partial mapping done by a device driver. */
	flush_cache_range(mm, vma->vm_start, vma->vm_end);
	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
	flush_tlb_range(mm, vma->vm_start, vma->vm_end);
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
	return error;
}
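/*
 * Usage note (illustrative, not in the original source): pgoff is in page
 * units, so a caller mapping from byte offset "off" in a file would pass
 * off >> PAGE_SHIFT.  That is what makes the page-granular overflow check
 * above, (pgoff + (len >> PAGE_SHIFT)) < pgoff, sufficient even for offsets
 * that would not fit in a byte count.
 */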
/* Get an address range which is currently unmapped.
 * For mmap() without MAP_FIXED and shmat() with addr=0.
 * Return value 0 means ENOMEM.
 */
unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
{
	struct vm_area_struct * vmm;

	if (len > TASK_SIZE)
		return 0;
	if (!addr)
		addr = TASK_UNMAPPED_BASE;
	addr = PAGE_ALIGN(addr);

	for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
		/* At this point:  (!vmm || addr < vmm->vm_end). */
		if (TASK_SIZE - len < addr)
			return 0;
		if (!vmm || addr + len <= vmm->vm_start)
			return addr;
		addr = vmm->vm_end;
	}
}
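/*
 * Illustrative note (not in the original source): this is a first-fit scan.
 * Starting at the hint (or TASK_UNMAPPED_BASE, typically TASK_SIZE/3 on
 * i386), each iteration either finds that the gap before the next vma is
 * big enough and returns addr, or skips to the end of that vma and tries
 * again, returning 0 once addr would run past TASK_SIZE - len.
 */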
#define vm_avl_empty	(struct vm_area_struct *) NULL

#include "mmap_avl.c"
/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
	struct vm_area_struct *vma = NULL;

	if (mm) {
		/* Check the cache first. */
		/* (Cache hit rate is typically around 35%.) */
		vma = mm->mmap_cache;
		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
			if (!mm->mmap_avl) {
				/* Go through the linear list. */
				vma = mm->mmap;
				while (vma && vma->vm_end <= addr)
					vma = vma->vm_next;
			} else {
				/* Then go through the AVL tree quickly. */
				struct vm_area_struct * tree = mm->mmap_avl;
				vma = NULL;
				for (;;) {
					if (tree == vm_avl_empty)
						break;
					if (tree->vm_end > addr) {
						vma = tree;
						if (tree->vm_start <= addr)
							break;
						tree = tree->vm_avl_left;
					} else
						tree = tree->vm_avl_right;
				}
			}
			if (vma)
				mm->mmap_cache = vma;
		}
	}
	return vma;
}
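/*
 * Illustrative note (not in the original source): small address spaces are
 * kept only on the sorted linear list (O(n) lookup); once map_count reaches
 * AVL_MIN_MAP_COUNT, insert_vm_struct() below builds mm->mmap_avl and
 * lookups switch to the O(log n) tree walk, with mm->mmap_cache
 * short-circuiting repeated lookups of the same vma in either case.
 */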
/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
				      struct vm_area_struct **pprev)
{
	if (mm) {
		if (!mm->mmap_avl) {
			/* Go through the linear list. */
			struct vm_area_struct * prev = NULL;
			struct vm_area_struct * vma = mm->mmap;
			while (vma && vma->vm_end <= addr) {
				prev = vma;
				vma = vma->vm_next;
			}
			*pprev = prev;
			return vma;
		} else {
			/* Go through the AVL tree quickly. */
			struct vm_area_struct * vma = NULL;
			struct vm_area_struct * last_turn_right = NULL;
			struct vm_area_struct * prev = NULL;
			struct vm_area_struct * tree = mm->mmap_avl;
			for (;;) {
				if (tree == vm_avl_empty)
					break;
				if (tree->vm_end > addr) {
					vma = tree;
					prev = last_turn_right;
					if (tree->vm_start <= addr)
						break;
					tree = tree->vm_avl_left;
				} else {
					last_turn_right = tree;
					tree = tree->vm_avl_right;
				}
			}
			if (vma) {
				if (vma->vm_avl_left != vm_avl_empty) {
					prev = vma->vm_avl_left;
					while (prev->vm_avl_right != vm_avl_empty)
						prev = prev->vm_avl_right;
				}
				if ((prev ? prev->vm_next : mm->mmap) != vma)
					printk("find_vma_prev: tree inconsistent with list\n");
				*pprev = prev;
				return vma;
			}
		}
	}
	*pprev = NULL;
	return NULL;
}
struct vm_area_struct * find_extend_vma(struct task_struct * tsk, unsigned long addr)
{
	struct vm_area_struct * vma;
	unsigned long start;

	addr &= PAGE_MASK;
	vma = find_vma(tsk->mm,addr);
	if (!vma)
		return NULL;
	if (vma->vm_start <= addr)
		return vma;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		return NULL;
	start = vma->vm_start;
	if (expand_stack(vma, addr))
		return NULL;
	if (vma->vm_flags & VM_LOCKED) {
		make_pages_present(addr, start);
	}
	return vma;
}
/* Normal function to fix up a mapping
 * This function is the default for when an area has no specific
 * function.  This may be used as part of a more specific routine.
 * This function works out what part of an area is affected and
 * adjusts the mapping information.  Since the actual page
 * manipulation is done in do_mmap(), none need be done here,
 * though it would probably be more appropriate.
 *
 * By the time this function is called, the area struct has been
 * removed from the process mapping list, so it needs to be
 * reinserted if necessary.
 *
 * The 4 main cases are:
 *    Unmapping the whole area
 *    Unmapping from the start of the segment to a point in it
 *    Unmapping from an intermediate point to the end
 *    Unmapping between two intermediate points, making a hole.
 *
 * Case 4 involves the creation of 2 new areas, for each side of
 * the hole.  If possible, we reuse the existing area rather than
 * allocate a new one, and the return indicates whether the old
 * area was reused.
 */
static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area,
	unsigned long addr, size_t len, struct vm_area_struct *extra)
{
	struct vm_area_struct *mpnt;
	unsigned long end = addr + len;

	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
	if (area->vm_flags & VM_LOCKED)
		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;

	/* Unmapping the whole area. */
	if (addr == area->vm_start && end == area->vm_end) {
		if (area->vm_ops && area->vm_ops->close)
			area->vm_ops->close(area);
		if (area->vm_file)
			fput(area->vm_file);
		kmem_cache_free(vm_area_cachep, area);
		return extra;
	}

	/* Work out to one of the ends. */
	if (end == area->vm_end) {
		area->vm_end = addr;
		vmlist_modify_lock(current->mm);
	} else if (addr == area->vm_start) {
		area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
		area->vm_start = end;
		vmlist_modify_lock(current->mm);
	} else {
		/* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
		/* Add end mapping -- leave beginning for below */
		mpnt = extra;
		extra = NULL;

		mpnt->vm_mm = area->vm_mm;
		mpnt->vm_start = end;
		mpnt->vm_end = area->vm_end;
		mpnt->vm_page_prot = area->vm_page_prot;
		mpnt->vm_flags = area->vm_flags;
		mpnt->vm_ops = area->vm_ops;
		mpnt->vm_pgoff = area->vm_pgoff;
		area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
		mpnt->vm_file = area->vm_file;
		mpnt->vm_private_data = area->vm_private_data;
		if (mpnt->vm_file)
			get_file(mpnt->vm_file);
		if (mpnt->vm_ops && mpnt->vm_ops->open)
			mpnt->vm_ops->open(mpnt);
		area->vm_end = addr;	/* Truncate area */
		vmlist_modify_lock(current->mm);
		insert_vm_struct(current->mm, mpnt);
	}

	insert_vm_struct(current->mm, area);
	vmlist_modify_unlock(current->mm);
	return extra;
}
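/*
 * Worked example of case 4 (illustrative, not in the original source):
 * unmapping [0x40001000, 0x40003000) from an area covering
 * [0x40000000, 0x40005000) truncates the original area to
 * [0x40000000, 0x40001000) and turns the preallocated "extra" vma into
 * [0x40003000, 0x40005000); both are reinserted, and NULL is returned so
 * do_munmap() knows the spare vma was consumed.
 */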
/*
 * Try to free as many page directory entries as we can,
 * without having to work very hard at actually scanning
 * the page tables themselves.
 *
 * Right now we try to free page tables if we have a nice
 * PGDIR-aligned area that got free'd up. We could be more
 * granular if we want to, but this is fast and simple,
 * and covers the bad cases.
 *
 * "prev", if it exists, points to a vma before the one
 * we just free'd - but there's no telling how much before.
 */
static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev,
	unsigned long start, unsigned long end)
{
	unsigned long first = start & PGDIR_MASK;
	unsigned long last = (end + PGDIR_SIZE - 1) & PGDIR_MASK;

	if (!prev) {
		prev = mm->mmap;
		if (!prev)
			goto no_mmaps;
		if (prev->vm_end > start) {
			if (last > prev->vm_start)
				last = prev->vm_start;
			goto no_mmaps;
		}
	}
	for (;;) {
		struct vm_area_struct *next = prev->vm_next;

		if (next) {
			if (next->vm_start < start) {
				prev = next;
				continue;
			}
			if (last > next->vm_start)
				last = next->vm_start;
		}
		if (prev->vm_end > first)
			first = prev->vm_end + PGDIR_SIZE - 1;
		break;
	}
no_mmaps:
	first = first >> PGDIR_SHIFT;
	last = last >> PGDIR_SHIFT;
	if (last > first)
		clear_page_tables(mm, first, last-first);
}
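/*
 * Illustrative note (not in the original source): "first" and "last" are
 * rounded to page-directory boundaries and then trimmed against the
 * neighbouring vmas, so page tables are only reclaimed for PGDIR-sized
 * slots (4 MB on i386 with 4 KB pages and PGDIR_SHIFT == 22) that end up
 * with no vma overlapping them; smaller holes are deliberately left alone
 * for speed, as the comment above explains.
 */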
/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@sw.oz.au>
 */
int do_munmap(unsigned long addr, size_t len)
{
	struct mm_struct * mm;
	struct vm_area_struct *mpnt, *prev, **npp, *free, *extra;

	if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
		return -EINVAL;

	if ((len = PAGE_ALIGN(len)) == 0)
		return -EINVAL;

	/* Check if this memory area is ok - put it on the temporary
	 * list if so..  The checks here are pretty simple --
	 * every area affected in some way (by any overlap) is put
	 * on the list.  If nothing is put on, nothing is affected.
	 */
	mm = current->mm;
	mpnt = find_vma_prev(mm, addr, &prev);
	if (!mpnt)
		return 0;
	/* we have  addr < mpnt->vm_end  */

	if (mpnt->vm_start >= addr+len)
		return 0;

	/* If we'll make "hole", check the vm areas limit */
	if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len)
	    && mm->map_count >= MAX_MAP_COUNT)
		return -ENOMEM;

	/*
	 * We may need one additional vma to fix up the mappings ...
	 * and this is the last chance for an easy error exit.
	 */
	extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!extra)
		return -ENOMEM;

	npp = (prev ? &prev->vm_next : &mm->mmap);
	free = NULL;
	vmlist_modify_lock(mm);
	for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
		*npp = mpnt->vm_next;
		mpnt->vm_next = free;
		free = mpnt;
		if (mm->mmap_avl)
			avl_remove(mpnt, &mm->mmap_avl);
	}
	mm->mmap_cache = NULL;	/* Kill the cache. */
	vmlist_modify_unlock(mm);

	/* Ok - we have the memory areas we should free on the 'free' list,
	 * so release them, and unmap the page range..
	 * If one of the segments is only being partially unmapped,
	 * it will put new vm_area_struct(s) into the address space.
	 */
	while ((mpnt = free) != NULL) {
		unsigned long st, end, size;

		free = free->vm_next;

		st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
		end = addr+len;
		end = end > mpnt->vm_end ? mpnt->vm_end : end;
		size = end - st;

		lock_kernel();
		if (mpnt->vm_ops && mpnt->vm_ops->unmap)
			mpnt->vm_ops->unmap(mpnt, st, size);
		unlock_kernel();

		remove_shared_vm_struct(mpnt);
		mm->map_count--;

		flush_cache_range(mm, st, end);
		zap_page_range(mm, st, size);
		flush_tlb_range(mm, st, end);

		/*
		 * Fix the mapping, and free the old area if it wasn't reused.
		 */
		extra = unmap_fixup(mpnt, st, size, extra);
	}

	/* Release the extra vma struct if it wasn't used */
	if (extra)
		kmem_cache_free(vm_area_cachep, extra);

	free_pgtables(mm, prev, addr, addr+len);

	return 0;
}
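/*
 * Usage note (illustrative, not in the original source): do_munmap() expects
 * the caller to already hold mm->mmap_sem, as sys_munmap() below and
 * sys_brk() above both do around their calls.  Only the vm_ops->unmap
 * callback still needs the big kernel lock, which is why lock_kernel() is
 * taken just around that call and nowhere else in this path.
 */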
asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
	int ret;

	down(&current->mm->mmap_sem);
	ret = do_munmap(addr, len);
	up(&current->mm->mmap_sem);
	return ret;
}
/*
 *  this is really a simplified "do_mmap".  it only handles
 *  anonymous maps.  eventually we may be able to do some
 *  brk-specific accounting here.
 */
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma;
	unsigned long flags, retval;

	len = PAGE_ALIGN(len);
	if (!len)
		return addr;

	/*
	 * mlock MCL_FUTURE?
	 */
	if (mm->def_flags & VM_LOCKED) {
		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
		locked += len;
		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
			return -EAGAIN;
	}

	/*
	 * Clear old maps.  this also does some error checking for us
	 */
	retval = do_munmap(addr, len);
	if (retval != 0)
		return retval;

	/* Check against address space limits *after* clearing old maps... */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	if (mm->map_count > MAX_MAP_COUNT)
		return -ENOMEM;

	if (!vm_enough_memory(len >> PAGE_SHIFT))
		return -ENOMEM;

	/*
	 * create a vma struct for an anonymous mapping
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma)
		return -ENOMEM;

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags(PROT_READ|PROT_WRITE|PROT_EXEC,
				 MAP_FIXED|MAP_PRIVATE) | mm->def_flags;

	vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
	vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
	vma->vm_ops = NULL;
	vma->vm_pgoff = 0;
	vma->vm_file = NULL;
	vma->vm_private_data = NULL;

	/*
	 * merge_segments may merge our vma, so we can't refer to it
	 * after the call.  Save the values we need now ...
	 */
	flags = vma->vm_flags;
	addr = vma->vm_start;

	vmlist_modify_lock(mm);
	insert_vm_struct(mm, vma);
	merge_segments(mm, vma->vm_start, vma->vm_end);
	vmlist_modify_unlock(mm);

	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
	return addr;
}
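/*
 * Usage note (illustrative, not in the original source): sys_brk() above
 * calls do_brk(oldbrk, newbrk-oldbrk) and treats any return value other
 * than oldbrk as failure, so on success do_brk() must hand back the start
 * address it was given (the value is saved before merge_segments() may
 * fold the new vma into a neighbour).
 */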
/* Build the AVL tree corresponding to the VMA list. */
void build_mmap_avl(struct mm_struct * mm)
{
	struct vm_area_struct * vma;

	mm->mmap_avl = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		avl_insert(vma, &mm->mmap_avl);
}
/* Release all mmaps. */
void exit_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt;

	release_segments(mm);
	mpnt = mm->mmap;
	vmlist_modify_lock(mm);
	mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
	vmlist_modify_unlock(mm);
	mm->rss = 0;
	mm->total_vm = 0;
	mm->locked_vm = 0;
	while (mpnt) {
		struct vm_area_struct * next = mpnt->vm_next;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long size = end - start;

		if (mpnt->vm_ops) {
			if (mpnt->vm_ops->unmap)
				mpnt->vm_ops->unmap(mpnt, start, size);
			if (mpnt->vm_ops->close)
				mpnt->vm_ops->close(mpnt);
		}
		mm->map_count--;
		remove_shared_vm_struct(mpnt);
		zap_page_range(mm, start, size);
		if (mpnt->vm_file)
			fput(mpnt->vm_file);
		kmem_cache_free(vm_area_cachep, mpnt);
		mpnt = next;
	}

	/* This is just debugging */
	if (mm->map_count)
		printk("exit_mmap: map count is %d\n", mm->map_count);

	clear_page_tables(mm, 0, USER_PTRS_PER_PGD);
}
/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap ring.
 */
void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
{
	struct vm_area_struct **pprev;
	struct file * file;

	if (!mm->mmap_avl) {
		pprev = &mm->mmap;
		while (*pprev && (*pprev)->vm_start <= vmp->vm_start)
			pprev = &(*pprev)->vm_next;
	} else {
		struct vm_area_struct *prev, *next;
		avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next);
		pprev = (prev ? &prev->vm_next : &mm->mmap);
		if (*pprev != next)
			printk("insert_vm_struct: tree inconsistent with list\n");
	}
	vmp->vm_next = *pprev;
	*pprev = vmp;

	mm->map_count++;
	if (mm->map_count >= AVL_MIN_MAP_COUNT && !mm->mmap_avl)
		build_mmap_avl(mm);

	file = vmp->vm_file;
	if (file) {
		struct inode * inode = file->f_dentry->d_inode;
		if (vmp->vm_flags & VM_DENYWRITE)
			atomic_dec(&inode->i_writecount);

		/* insert vmp into inode's share list */
		spin_lock(&inode->i_shared_lock);
		if ((vmp->vm_next_share = inode->i_mmap) != NULL)
			inode->i_mmap->vm_pprev_share = &vmp->vm_next_share;
		inode->i_mmap = vmp;
		vmp->vm_pprev_share = &inode->i_mmap;
		spin_unlock(&inode->i_shared_lock);
	}
}
/* Merge the list of memory segments if possible.
 * Redundant vm_area_structs are freed.
 * This assumes that the list is ordered by address.
 * We don't need to traverse the entire list, only those segments
 * which intersect or are adjacent to a given interval.
 *
 * We must already hold the mm semaphore when we get here..
 */
void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
{
	struct vm_area_struct *prev, *mpnt, *next, *prev1;

	mpnt = find_vma_prev(mm, start_addr, &prev1);
	if (!mpnt)
		return;

	if (prev1) {
		prev = prev1;
	} else {
		prev = mpnt;
		mpnt = mpnt->vm_next;
	}
	mm->mmap_cache = NULL;		/* Kill the cache. */

	/* prev and mpnt cycle through the list, as long as
	 * start_addr < mpnt->vm_end && prev->vm_start < end_addr
	 */
	for ( ; mpnt && prev->vm_start < end_addr ; prev = mpnt, mpnt = next) {
		next = mpnt->vm_next;

		/* To share, we must have the same file, operations.. */
		if ((mpnt->vm_file != prev->vm_file) ||
		    (mpnt->vm_private_data != prev->vm_private_data) ||
		    (mpnt->vm_ops != prev->vm_ops) ||
		    (mpnt->vm_flags != prev->vm_flags) ||
		    (prev->vm_end != mpnt->vm_start))
			continue;

		/*
		 * If we have a file or it's a shared memory area
		 * the offsets must be contiguous..
		 */
		if ((mpnt->vm_file != NULL) || (mpnt->vm_flags & VM_SHM)) {
			unsigned long off = prev->vm_pgoff;
			off += (prev->vm_end - prev->vm_start) >> PAGE_SHIFT;
			if (off != mpnt->vm_pgoff)
				continue;
		}

		/* merge prev with mpnt and set up pointers so the new
		 * big segment can possibly merge with the next one.
		 * The old unused mpnt is freed.
		 */
		if (mm->mmap_avl)
			avl_remove(mpnt, &mm->mmap_avl);
		prev->vm_end = mpnt->vm_end;
		prev->vm_next = mpnt->vm_next;
		if (mpnt->vm_ops && mpnt->vm_ops->close) {
			mpnt->vm_pgoff += (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
			mpnt->vm_start = mpnt->vm_end;
			vmlist_modify_unlock(mm);
			mpnt->vm_ops->close(mpnt);
			vmlist_modify_lock(mm);
		}
		mm->map_count--;
		remove_shared_vm_struct(mpnt);
		if (mpnt->vm_file)
			fput(mpnt->vm_file);
		kmem_cache_free(vm_area_cachep, mpnt);
		mpnt = prev;
	}
}
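/*
 * Illustrative note (not in the original source): two neighbouring vmas are
 * merged only if they have the same file, vm_ops, private data and flags and
 * are exactly adjacent; for file or SysV-shm backed areas their page offsets
 * must also be contiguous.  For example, two adjacent anonymous MAP_PRIVATE
 * areas created by successive do_brk() calls collapse into a single vma here.
 */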
void __init vma_init(void)
{
	vm_area_cachep = kmem_cache_create("vm_area_struct",
					   sizeof(struct vm_area_struct),
					   0, SLAB_HWCACHE_ALIGN,
					   NULL, NULL);
	if (!vm_area_cachep)
		panic("vma_init: Cannot alloc vm_area_struct cache.");

	mm_cachep = kmem_cache_create("mm_struct",
				      sizeof(struct mm_struct),
				      0, SLAB_HWCACHE_ALIGN,
				      NULL, NULL);
	if (!mm_cachep)
		panic("vma_init: Cannot alloc mm_struct cache.");
}