6 #include <linux/stat.h>
7 #include <linux/sched.h>
8 #include <linux/kernel.h>
10 #include <linux/slab.h>
11 #include <linux/shm.h>
12 #include <linux/errno.h>
13 #include <linux/mman.h>
14 #include <linux/string.h>
15 #include <linux/pagemap.h>
16 #include <linux/swap.h>
17 #include <linux/smp.h>
18 #include <linux/smp_lock.h>
19 #include <linux/init.h>
20 #include <linux/file.h>
22 #include <asm/uaccess.h>
23 #include <asm/system.h>
24 #include <asm/pgtable.h>
26 /* Description of the effects of mapping type and prot in the current implementation.
27 * This is due to the limited x86 page protection hardware. The expected
28 * behavior is in parens:
31 *              PROT_NONE     PROT_READ     PROT_WRITE      PROT_EXEC
32 * MAP_SHARED   r: (no) no    r: (yes) yes  r: (no) yes     r: (no) yes
33 *              w: (no) no    w: (no) no    w: (yes) yes    w: (no) no
34 *              x: (no) no    x: (no) yes   x: (no) yes     x: (yes) yes
36 * MAP_PRIVATE  r: (no) no    r: (yes) yes  r: (no) yes     r: (no) yes
37 *              w: (no) no    w: (no) no    w: (copy) copy  w: (no) no
38 *              x: (no) no    x: (no) yes   x: (no) yes     x: (yes) yes
 */
/* Table of 16 page protection values.  Entries 0-7 are the __Pxxx values and
 * entries 8-15 the __Sxxx values, each group in ascending order of its
 * three-bit suffix.  do_mmap() picks an entry with (vma->vm_flags & 0x0f).
 * NOTE(review): presumably __P is the private set and __S the shared set --
 * confirm against the architecture headers.
 */
41 pgprot_t protection_map
[16] = {
42 __P000
, __P001
, __P010
, __P011
, __P100
, __P101
, __P110
, __P111
,
43 __S000
, __S001
, __S010
, __S011
, __S100
, __S101
, __S110
, __S111
46 /* SLAB cache for vm_area_struct's. */
47 kmem_cache_t
*vm_area_cachep
;
/* Tested first by vm_enough_memory(), ahead of its free-memory estimate. */
49 int sysctl_overcommit_memory
;
51 /* Check that a process has enough memory to allocate a
52 * new virtual mapping.
/*
 * vm_enough_memory - heuristic test that more pages can be handed out.
 * @pages: number of pages the caller wants to add (sys_brk() and do_mmap()
 *         pass a byte length shifted right by PAGE_SHIFT).
 *
 * Tests sysctl_overcommit_memory first.  Otherwise an estimate is built
 * from buffermem >> PAGE_SHIFT, page_cache_size, nr_free_pages and
 * nr_swap_pages, num_physpages >> 4 (one sixteenth of num_physpages) is
 * subtracted, and the result is (estimate > pages).
 *
 * NOTE(review): the declaration of freepages and the statement guarded by
 * the overcommit test are not visible here -- confirm that a non-zero
 * sysctl short-circuits to success.
 */
54 static inline int vm_enough_memory(long pages
)
56 /* Stupid algorithm to decide if we have enough memory: while
57 * simple, it hopefully works in most obvious cases.. Easy to
58 * fool it, but this should catch most mistakes.
62 /* Sometimes we want to use more memory than we have. */
63 if (sysctl_overcommit_memory
)
66 freepages
= buffermem
>> PAGE_SHIFT
;
67 freepages
+= page_cache_size
;
69 freepages
+= nr_free_pages
;
70 freepages
+= nr_swap_pages
;
71 freepages
-= num_physpages
>> 4;
72 return freepages
> pages
;
75 /* Remove one vm structure from the inode's i_mmap ring. */
/*
 * remove_shared_vm_struct - unlink a vma from the share list of its file.
 * @vma: area to unlink; its vm_file, vm_next_share and vm_pprev_share
 *       fields are read.
 *
 * For a VM_DENYWRITE area the inode i_writecount is incremented, which
 * mirrors the decrement done in insert_vm_struct().  The area is then
 * removed from the list threaded through vm_next_share / vm_pprev_share:
 * the successor (if any) inherits our back pointer, and the slot that
 * pointed at us is redirected to the successor.
 *
 * NOTE(review): any guard for a NULL vm_file is not visible here -- confirm
 * before calling this on an anonymous mapping.
 */
76 static inline void remove_shared_vm_struct(struct vm_area_struct
*vma
)
78 struct file
* file
= vma
->vm_file
;
81 if (vma
->vm_flags
& VM_DENYWRITE
)
82 file
->f_dentry
->d_inode
->i_writecount
++;
83 if(vma
->vm_next_share
)
84 vma
->vm_next_share
->vm_pprev_share
= vma
->vm_pprev_share
;
85 *vma
->vm_pprev_share
= vma
->vm_next_share
;
/*
 * sys_brk - asmlinkage entry point that moves the program break (mm->brk).
 * @brk: requested new break address.
 *
 * Checks visible below, in order:
 *   - brk below mm->end_code;
 *   - page-aligned old and new break are equal;
 *   - brk <= mm->brk: shrink through do_munmap(newbrk, oldbrk - newbrk);
 *   - RLIMIT_DATA: the limit is compared with RLIM_INFINITY and then with
 *     brk - mm->end_code;
 *   - find_vma_intersection() is called with oldbrk and newbrk + PAGE_SIZE
 *     to detect a collision with an existing mapping;
 *   - vm_enough_memory() is asked about the number of pages being added.
 * Growth is done with do_mmap(NULL, oldbrk, newbrk - oldbrk, ...) using
 * PROT_READ|PROT_WRITE|PROT_EXEC and MAP_FIXED|MAP_PRIVATE, and the result
 * is compared against oldbrk.
 *
 * NOTE(review): the return statements, labels and any locking are not
 * visible in this copy, so the value returned on each path cannot be
 * confirmed from here.
 */
89 asmlinkage
unsigned long sys_brk(unsigned long brk
)
91 unsigned long rlim
, retval
;
92 unsigned long newbrk
, oldbrk
;
93 struct mm_struct
*mm
= current
->mm
;
96 if (brk
< mm
->end_code
)
98 newbrk
= PAGE_ALIGN(brk
);
99 oldbrk
= PAGE_ALIGN(mm
->brk
);
100 if (oldbrk
== newbrk
)
103 /* Always allow shrinking brk. */
104 if (brk
<= mm
->brk
) {
105 if (!do_munmap(newbrk
, oldbrk
-newbrk
))
110 /* Check against rlimit and stack.. */
111 rlim
= current
->rlim
[RLIMIT_DATA
].rlim_cur
;
112 if (rlim
>= RLIM_INFINITY
)
114 if (brk
- mm
->end_code
> rlim
)
117 /* Check against existing mmap mappings. */
118 if (find_vma_intersection(mm
, oldbrk
, newbrk
+PAGE_SIZE
))
121 /* Check if we have enough memory.. */
122 if (!vm_enough_memory((newbrk
-oldbrk
) >> PAGE_SHIFT
))
125 /* Ok, looks good - let it rip. */
126 if (do_mmap(NULL
, oldbrk
, newbrk
-oldbrk
,
127 PROT_READ
|PROT_WRITE
|PROT_EXEC
,
128 MAP_FIXED
|MAP_PRIVATE
, 0) != oldbrk
)
138 /* Combine the mmap "prot" and "flags" arguments into one "vm_flags" value used
139 * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
 * into the corresponding "VM_xxx" bits.
 */
/*
 * vm_flags - build the internal VM_xxx flag word from mmap prot and flags.
 * @prot:  PROT_READ, PROT_WRITE and PROT_EXEC are translated.
 * @flags: only MAP_GROWSDOWN, MAP_DENYWRITE and MAP_EXECUTABLE are
 *         translated here; other MAP_xxx bits are ignored by this helper.
 *
 * _trans(x, bit1, bit2) yields (x & bit1) when the two constants are
 * equal, otherwise bit2 if bit1 is set in x and 0 if it is not.
 * The function returns prot_bits | flag_bits.
 *
 * NOTE(review): _trans does not parenthesize its arguments; keep that in
 * mind if it is ever used with a compound expression.  The two lines that
 * assign prot_bits and flag_bits are not visible in this copy.
 */
142 static inline unsigned long vm_flags(unsigned long prot
, unsigned long flags
)
144 #define _trans(x,bit1,bit2) \
145 ((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
147 unsigned long prot_bits
, flag_bits
;
149 _trans(prot
, PROT_READ
, VM_READ
) |
150 _trans(prot
, PROT_WRITE
, VM_WRITE
) |
151 _trans(prot
, PROT_EXEC
, VM_EXEC
);
153 _trans(flags
, MAP_GROWSDOWN
, VM_GROWSDOWN
) |
154 _trans(flags
, MAP_DENYWRITE
, VM_DENYWRITE
) |
155 _trans(flags
, MAP_EXECUTABLE
, VM_EXECUTABLE
);
156 return prot_bits
| flag_bits
;
/*
 * do_mmap - create a new mapping in the address space of current.
 * @file:  file to map, or NULL (sys_brk() passes NULL for an anonymous area).
 * @addr:  requested address; must be page aligned when MAP_FIXED is set,
 *         otherwise it is passed to get_unmapped_area().
 * @len:   length in bytes; rounded up with PAGE_ALIGN().
 * @prot:  PROT_xxx bits.
 * @flags: MAP_xxx bits.
 * @off:   stored in vma->vm_offset.
 *
 * Steps visible below, in order:
 *   1. Sanity checks: zero length after alignment, len and addr against
 *      TASK_SIZE, mm->map_count against MAX_MAP_COUNT and, when
 *      mm->def_flags has VM_LOCKED, locked memory against RLIMIT_MEMLOCK.
 *   2. Access checks on flags & MAP_TYPE: a PROT_WRITE mapping is tested
 *      against (file->f_mode & 2), mandatory locks are tested with
 *      locks_verify_locked(), and (file->f_mode & 1) is tested; without a
 *      file the type is compared with MAP_PRIVATE.
 *   3. Address selection (MAP_FIXED or get_unmapped_area()) and a test that
 *      a file supplies f_op->mmap.
 *   4. A vma is taken from vm_area_cachep and filled in.  vm_flags comes
 *      from vm_flags(prot, flags) | mm->def_flags plus VM_MAYxxx bits; a
 *      MAP_SHARED mapping of a file without (f_mode & 2) loses VM_MAYWRITE
 *      and VM_SHARED.  vm_page_prot is protection_map[vm_flags & 0x0f].
 *   5. do_munmap(addr, len) is called on the target range, then
 *      RLIMIT_AS is checked and, for private writable mappings without
 *      MAP_NORESERVE, vm_enough_memory().
 *   6. For VM_DENYWRITE, i_writecount is tested and decremented before
 *      file->f_op->mmap(file, vma) is called, and incremented again after.
 *   7. The vma is inserted and merged; total_vm is increased and, for
 *      VM_LOCKED areas that are not VM_IO, locked_vm is increased and
 *      get_user() is used on the range.
 *   8. kmem_cache_free() releases the vma; presumably this is the error
 *      path -- confirm.
 *
 * NOTE(review): return values, case labels, gotos and several guards
 * (for example the test that @file is non-NULL before file->f_mode is
 * read) are not visible in this copy and must be confirmed against the
 * complete source.  The meaning of f_mode bits 1 and 2 is presumably
 * read and write; the comment further down ties bit 2 to write access.
 */
160 unsigned long do_mmap(struct file
* file
, unsigned long addr
, unsigned long len
,
161 unsigned long prot
, unsigned long flags
, unsigned long off
)
163 struct mm_struct
* mm
= current
->mm
;
164 struct vm_area_struct
* vma
;
165 int correct_wcount
= 0, error
;
167 if ((len
= PAGE_ALIGN(len
)) == 0)
170 if (len
> TASK_SIZE
|| addr
> TASK_SIZE
-len
)
173 /* offset overflow? */
177 /* Too many mappings? */
178 if (mm
->map_count
> MAX_MAP_COUNT
)
181 /* mlock MCL_FUTURE? */
182 if (mm
->def_flags
& VM_LOCKED
) {
183 unsigned long locked
= mm
->locked_vm
<< PAGE_SHIFT
;
185 if (locked
> current
->rlim
[RLIMIT_MEMLOCK
].rlim_cur
)
189 /* Do simple checking here so the lower-level routines won't have
190 * to. we assume access permissions have been handled by the open
191 * of the memory object, so we don't do any here.
194 switch (flags
& MAP_TYPE
) {
196 if ((prot
& PROT_WRITE
) && !(file
->f_mode
& 2))
199 /* make sure there are no mandatory locks on the file. */
200 if (locks_verify_locked(file
->f_dentry
->d_inode
))
204 if (!(file
->f_mode
& 1))
211 } else if ((flags
& MAP_TYPE
) != MAP_PRIVATE
)
214 /* Obtain the address to map to. we verify (or select) it and ensure
215 * that it represents a valid section of the address space.
217 if (flags
& MAP_FIXED
) {
218 if (addr
& ~PAGE_MASK
)
221 addr
= get_unmapped_area(addr
, len
);
226 /* Determine the object being mapped and call the appropriate
227 * specific mapper. the address has already been validated, but
228 * not unmapped, but the maps are removed from the list.
230 if (file
&& (!file
->f_op
|| !file
->f_op
->mmap
))
233 vma
= kmem_cache_alloc(vm_area_cachep
, SLAB_KERNEL
);
238 vma
->vm_start
= addr
;
239 vma
->vm_end
= addr
+ len
;
240 vma
->vm_flags
= vm_flags(prot
,flags
) | mm
->def_flags
;
243 if (file
->f_mode
& 1)
244 vma
->vm_flags
|= VM_MAYREAD
| VM_MAYWRITE
| VM_MAYEXEC
;
245 if (flags
& MAP_SHARED
) {
246 vma
->vm_flags
|= VM_SHARED
| VM_MAYSHARE
;
248 /* This looks strange, but when we don't have the file open
249 * for writing, we can demote the shared mapping to a simpler
250 * private mapping. That also takes care of a security hole
251 * with ptrace() writing to a shared mapping without write
254 * We leave the VM_MAYSHARE bit on, just to get correct output
255 * from /proc/xxx/maps..
257 if (!(file
->f_mode
& 2))
258 vma
->vm_flags
&= ~(VM_MAYWRITE
| VM_SHARED
);
261 vma
->vm_flags
|= VM_MAYREAD
| VM_MAYWRITE
| VM_MAYEXEC
;
262 vma
->vm_page_prot
= protection_map
[vma
->vm_flags
& 0x0f];
264 vma
->vm_offset
= off
;
270 if (do_munmap(addr
, len
))
273 /* Check against address space limit. */
274 if ((mm
->total_vm
<< PAGE_SHIFT
) + len
275 > current
->rlim
[RLIMIT_AS
].rlim_cur
)
278 /* Private writable mapping? Check memory availability.. */
279 if ((vma
->vm_flags
& (VM_SHARED
| VM_WRITE
)) == VM_WRITE
&&
280 !(flags
& MAP_NORESERVE
) &&
281 !vm_enough_memory(len
>> PAGE_SHIFT
))
286 if (vma
->vm_flags
& VM_DENYWRITE
) {
287 if (file
->f_dentry
->d_inode
->i_writecount
> 0)
290 /* f_op->mmap might possibly sleep
291 * (generic_file_mmap doesn't, but other code
292 * might). In any case, this takes care of any
293 * race that this might cause.
295 file
->f_dentry
->d_inode
->i_writecount
--;
300 error
= file
->f_op
->mmap(file
, vma
);
303 /* Fix up the count if necessary, then check for an error */
305 file
->f_dentry
->d_inode
->i_writecount
++;
310 * merge_segments may merge our vma, so we can't refer to it
311 * after the call. Save the values we need now ...
313 flags
= vma
->vm_flags
;
314 addr
= vma
->vm_start
; /* can addr have changed?? */
315 insert_vm_struct(mm
, vma
);
316 merge_segments(mm
, vma
->vm_start
, vma
->vm_end
);
318 mm
->total_vm
+= len
>> PAGE_SHIFT
;
319 if ((flags
& VM_LOCKED
) && !(flags
& VM_IO
)) {
320 unsigned long start
= addr
;
321 mm
->locked_vm
+= len
>> PAGE_SHIFT
;
324 get_user(c
,(char *) start
);
327 __asm__
__volatile__("": :"r" (c
));
333 kmem_cache_free(vm_area_cachep
, vma
);
337 /* Get an address range which is currently unmapped.
338 * For mmap() without MAP_FIXED and shmat() with addr=0.
339 * Return value 0 means ENOMEM.
/*
 * get_unmapped_area - look for a free address range of a given length.
 * @addr: hint address; the code below can replace it with
 *        TASK_UNMAPPED_BASE and always rounds it up with PAGE_ALIGN().
 * @len:  length of the range wanted, in bytes.
 *
 * Starting from find_vma(current->mm, addr), the vma list is walked through
 * vm_next.  Each round first tests whether addr has passed TASK_SIZE - len
 * and then whether the gap in front of the current vma is large enough
 * (no vma left, or addr + len <= vmm->vm_start).
 *
 * NOTE(review): the condition guarding the TASK_UNMAPPED_BASE assignment,
 * the return statements and the statement that advances addr past a vma
 * are not visible in this copy -- confirm them against the complete file.
 */
341 unsigned long get_unmapped_area(unsigned long addr
, unsigned long len
)
343 struct vm_area_struct
* vmm
;
348 addr
= TASK_UNMAPPED_BASE
;
349 addr
= PAGE_ALIGN(addr
);
351 for (vmm
= find_vma(current
->mm
, addr
); ; vmm
= vmm
->vm_next
) {
352 /* At this point: (!vmm || addr < vmm->vm_end). */
353 if (TASK_SIZE
- len
< addr
)
355 if (!vmm
|| addr
+ len
<= vmm
->vm_start
)
361 /* Normal function to fix up a mapping
362 * This function is the default for when an area has no specific
363 * function. This may be used as part of a more specific routine.
364 * This function works out what part of an area is affected and
365 * adjusts the mapping information. Since the actual page
366 * manipulation is done in do_mmap(), none need be done here,
367 * though it would probably be more appropriate.
369 * By the time this function is called, the area struct has been
370 * removed from the process mapping list, so it needs to be
371 * reinserted if necessary.
373 * The 4 main cases are:
374 * Unmapping the whole area
375 * Unmapping from the start of the segment to a point in it
376 * Unmapping from an intermediate point to the end
377 * Unmapping between two intermediate points, making a hole.
379 * Case 4 involves the creation of 2 new areas, for each side of
380 * the hole. If possible, we reuse the existing area rather than
381 * allocate a new one, and the return indicates whether the old
 * area was reused.
 */
/*
 * unmap_fixup - adjust one vma after [addr, addr + len) was unmapped from it.
 * @area:  the affected vma; do_munmap() has already taken it off the list.
 * @addr:  start of the unmapped range inside @area.
 * @len:   length of the unmapped range in bytes.
 * @extra: address of the spare vma pointer that do_munmap() allocated.
 *
 * total_vm (and locked_vm for VM_LOCKED areas) of area->vm_mm is reduced by
 * len >> PAGE_SHIFT first.  Then one of the cases is handled:
 *   - whole area unmapped: vm_ops->close() is called if present;
 *   - range ends at area->vm_end: the tail is cut off;
 *   - range starts at area->vm_start: vm_offset and vm_start move up to the
 *     end of the range;
 *   - hole in the middle: a second vma (mpnt) is filled in as a copy of
 *     @area covering [end, area->vm_end), the file f_count is incremented,
 *     vm_ops->open() is called on it, @area is truncated to end at @addr
 *     and mpnt is inserted into current->mm.
 * For the partial cases the area is then closed with vm_end temporarily
 * set to vm_start, reopened and reinserted into current->mm.
 *
 * do_munmap() frees @area when this function returns 0.
 *
 * NOTE(review): the return statements, the statements that take mpnt from
 * @extra, and the guard around the f_count increment are not visible in
 * this copy -- confirm them against the complete file.
 */
384 static int unmap_fixup(struct vm_area_struct
*area
, unsigned long addr
,
385 size_t len
, struct vm_area_struct
**extra
)
387 struct vm_area_struct
*mpnt
;
388 unsigned long end
= addr
+ len
;
390 area
->vm_mm
->total_vm
-= len
>> PAGE_SHIFT
;
391 if (area
->vm_flags
& VM_LOCKED
)
392 area
->vm_mm
->locked_vm
-= len
>> PAGE_SHIFT
;
394 /* Unmapping the whole area. */
395 if (addr
== area
->vm_start
&& end
== area
->vm_end
) {
396 if (area
->vm_ops
&& area
->vm_ops
->close
)
397 area
->vm_ops
->close(area
);
403 /* Work out to one of the ends. */
404 if (end
== area
->vm_end
)
406 else if (addr
== area
->vm_start
) {
407 area
->vm_offset
+= (end
- area
->vm_start
);
408 area
->vm_start
= end
;
410 /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
411 /* Add end mapping -- leave beginning for below */
415 mpnt
->vm_mm
= area
->vm_mm
;
416 mpnt
->vm_start
= end
;
417 mpnt
->vm_end
= area
->vm_end
;
418 mpnt
->vm_page_prot
= area
->vm_page_prot
;
419 mpnt
->vm_flags
= area
->vm_flags
;
420 mpnt
->vm_ops
= area
->vm_ops
;
421 mpnt
->vm_offset
= area
->vm_offset
+ (end
- area
->vm_start
);
422 mpnt
->vm_file
= area
->vm_file
;
424 mpnt
->vm_file
->f_count
++;
425 if (mpnt
->vm_ops
&& mpnt
->vm_ops
->open
)
426 mpnt
->vm_ops
->open(mpnt
);
427 area
->vm_end
= addr
; /* Truncate area */
428 insert_vm_struct(current
->mm
, mpnt
);
431 /* Close the current area ... */
432 if (area
->vm_ops
&& area
->vm_ops
->close
) {
433 end
= area
->vm_end
; /* save new end */
434 area
->vm_end
= area
->vm_start
;
435 area
->vm_ops
->close(area
);
438 /* ... then reopen and reinsert. */
439 if (area
->vm_ops
&& area
->vm_ops
->open
)
440 area
->vm_ops
->open(area
);
441 insert_vm_struct(current
->mm
, area
);
/*
 * sys_munmap - asmlinkage entry point for munmap.
 * @addr: start of the range to unmap.
 * @len:  length of the range in bytes.
 *
 * Hands both arguments unchanged to do_munmap() and keeps the result in ret.
 *
 * NOTE(review): the declaration of ret, the return statement and any
 * locking around the call are not visible in this copy.
 */
445 asmlinkage
int sys_munmap(unsigned long addr
, size_t len
)
450 ret
= do_munmap(addr
, len
);
455 /* Munmap is split into 2 main parts -- this part which finds
456 * what needs doing, and the areas themselves, which do the
457 * work. This now handles partial unmappings.
458 * Jeremy Fitzhardine <jeremy@sw.oz.au>
/*
 * do_munmap - unmap [addr, addr + len) from the current address space.
 * @addr: start address; tested for page alignment and against TASK_SIZE.
 * @len:  length in bytes; tested against TASK_SIZE - addr and rounded up
 *        with PAGE_ALIGN().
 *
 * Steps visible below, in order:
 *   1. Argument checks, including a zero length after alignment.
 *   2. The vma list is walked until the first area whose vm_end lies above
 *      @addr.
 *   3. A spare vma (extra) is allocated from vm_area_cachep up front; the
 *      comment below calls this the last chance for an easy error exit.
 *   4. Every vma that starts below addr + len is unlinked from the list
 *      (vm_pprev / vm_next fixups) and pushed onto the local free list.
 *   5. If the first entry on the free list strictly contains the range
 *      (a hole will be made) and mm->map_count is above MAX_MAP_COUNT,
 *      extra is released; presumably the call then fails -- confirm.
 *   6. For each vma on the free list: remove_shared_vm_struct(), clip the
 *      range to the vma (st, end, size), call vm_ops->unmap() if present,
 *      flush_cache_range(), zap_page_range(), flush_tlb_range(), then
 *      unmap_fixup(); the vma is freed when unmap_fixup() returns 0.
 *   7. extra is freed if it was not used and mm->mmap_cache is cleared.
 *
 * NOTE(review): return values, the assignments of mm, free, end and size,
 * and several guards are not visible in this copy.  A second variable
 * named next is declared inside the unlink loop and hides the outer one.
 */
460 int do_munmap(unsigned long addr
, size_t len
)
462 struct mm_struct
* mm
;
463 struct vm_area_struct
*mpnt
, *next
, *free
, *extra
;
466 if ((addr
& ~PAGE_MASK
) || addr
> TASK_SIZE
|| len
> TASK_SIZE
-addr
)
469 if ((len
= PAGE_ALIGN(len
)) == 0)
472 /* Check if this memory area is ok - put it on the temporary
473 * list if so.. The checks here are pretty simple --
474 * every area affected in some way (by any overlap) is put
475 * on the list. If nothing is put on, nothing is affected.
479 while(mpnt
&& mpnt
->vm_end
<= addr
)
480 mpnt
= mpnt
->vm_next
;
485 * We may need one additional vma to fix up the mappings ...
486 * and this is the last chance for an easy error exit.
488 extra
= kmem_cache_alloc(vm_area_cachep
, SLAB_KERNEL
);
492 next
= mpnt
->vm_next
;
494 /* we have mpnt->vm_next = next and addr < mpnt->vm_end */
496 for ( ; mpnt
&& mpnt
->vm_start
< addr
+len
; ) {
497 struct vm_area_struct
*next
= mpnt
->vm_next
;
500 mpnt
->vm_next
->vm_pprev
= mpnt
->vm_pprev
;
501 *mpnt
->vm_pprev
= mpnt
->vm_next
;
503 mpnt
->vm_next
= free
;
508 if (free
&& (free
->vm_start
< addr
) && (free
->vm_end
> addr
+len
)) {
509 if (mm
->map_count
> MAX_MAP_COUNT
) {
510 kmem_cache_free(vm_area_cachep
, extra
);
515 /* Ok - we have the memory areas we should free on the 'free' list,
516 * so release them, and unmap the page range..
517 * If the one of the segments is only being partially unmapped,
518 * it will put new vm_area_struct(s) into the address space.
521 while ((mpnt
= free
) != NULL
) {
522 unsigned long st
, end
, size
;
524 free
= free
->vm_next
;
528 remove_shared_vm_struct(mpnt
);
530 st
= addr
< mpnt
->vm_start
? mpnt
->vm_start
: addr
;
532 end
= end
> mpnt
->vm_end
? mpnt
->vm_end
: end
;
535 if (mpnt
->vm_ops
&& mpnt
->vm_ops
->unmap
)
536 mpnt
->vm_ops
->unmap(mpnt
, st
, size
);
538 flush_cache_range(mm
, st
, end
);
539 zap_page_range(mm
, st
, size
);
540 flush_tlb_range(mm
, st
, end
);
543 * Fix the mapping, and free the old area if it wasn't reused.
545 if (!unmap_fixup(mpnt
, st
, size
, &extra
))
546 kmem_cache_free(vm_area_cachep
, mpnt
);
549 /* Release the extra vma struct if it wasn't used */
551 kmem_cache_free(vm_area_cachep
, extra
);
554 mm
->mmap_cache
= NULL
; /* Kill the cache. */
558 /* Release all mmaps. */
/*
 * exit_mmap - tear down every mapping of an address space.
 * @mm: the address space being released.
 *
 * mm->mmap and mm->mmap_cache are cleared, then for each vma:
 * vm_ops->unmap() and vm_ops->close() are called when present,
 * remove_shared_vm_struct() unlinks it from its file, zap_page_range()
 * is called on its whole range and the vma is returned to vm_area_cachep.
 * A debugging printk reports mm->map_count at the end.
 *
 * NOTE(review): the loop header, the guard on mpnt->vm_ops, the condition
 * on the printk and any bookkeeping of map_count are not visible in this
 * copy -- confirm them against the complete file.
 */
559 void exit_mmap(struct mm_struct
* mm
)
561 struct vm_area_struct
* mpnt
;
564 mm
->mmap
= mm
->mmap_cache
= NULL
;
569 struct vm_area_struct
* next
= mpnt
->vm_next
;
570 unsigned long start
= mpnt
->vm_start
;
571 unsigned long end
= mpnt
->vm_end
;
572 unsigned long size
= end
- start
;
575 if (mpnt
->vm_ops
->unmap
)
576 mpnt
->vm_ops
->unmap(mpnt
, start
, size
);
577 if (mpnt
->vm_ops
->close
)
578 mpnt
->vm_ops
->close(mpnt
);
581 remove_shared_vm_struct(mpnt
);
582 zap_page_range(mm
, start
, size
);
585 kmem_cache_free(vm_area_cachep
, mpnt
);
589 /* This is just debugging */
591 printk("exit_mmap: map count is %d\n", mm
->map_count
);
594 /* Insert vm structure into process list sorted by address
595 * and into the inode's i_mmap ring.
/*
 * insert_vm_struct - link a vma into an address space and into its file.
 * @mm:  address space whose mm->mmap list receives the vma.
 * @vmp: the vma to insert.
 *
 * The list is scanned through a pointer-to-pointer (pprev) past every
 * entry whose vm_start is <= vmp->vm_start; vmp is linked in at that
 * point and the vm_pprev back pointers are updated.
 * For a file mapping the inode is looked up through f_dentry->d_inode,
 * i_writecount is decremented when the vma has VM_DENYWRITE, and vmp is
 * pushed on the front of the inode->i_mmap share list.
 *
 * NOTE(review): the store through pprev, the declaration and test of
 * file, and any update of mm->map_count are not visible in this copy.
 */
597 void insert_vm_struct(struct mm_struct
*mm
, struct vm_area_struct
*vmp
)
599 struct vm_area_struct
**pprev
= &mm
->mmap
;
604 /* Find where to link it in. */
605 while(*pprev
&& (*pprev
)->vm_start
<= vmp
->vm_start
)
606 pprev
= &(*pprev
)->vm_next
;
609 if((vmp
->vm_next
= *pprev
) != NULL
)
610 (*pprev
)->vm_pprev
= &vmp
->vm_next
;
612 vmp
->vm_pprev
= pprev
;
616 struct inode
* inode
= file
->f_dentry
->d_inode
;
617 if (vmp
->vm_flags
& VM_DENYWRITE
)
618 inode
->i_writecount
--;
620 /* insert vmp into inode's share list */
621 if((vmp
->vm_next_share
= inode
->i_mmap
) != NULL
)
622 inode
->i_mmap
->vm_pprev_share
= &vmp
->vm_next_share
;
624 vmp
->vm_pprev_share
= &inode
->i_mmap
;
628 /* Merge the list of memory segments if possible.
629 * Redundant vm_area_structs are freed.
630 * This assumes that the list is ordered by address.
631 * We don't need to traverse the entire list, only those segments
632 * which intersect or are adjacent to a given interval.
/*
 * merge_segments - coalesce adjacent compatible vmas around an interval.
 * @mm:         address space whose vma list is scanned.
 * @start_addr: start of the interval of interest.
 * @end_addr:   end of the interval of interest.
 *
 * The list is walked until the first vma whose vm_end lies above
 * @start_addr; from there prev and mpnt advance together while
 * prev->vm_start < end_addr.  Two neighbours are merged only when they
 * have the same vm_file, vm_pte, vm_ops and vm_flags, when
 * prev->vm_end == mpnt->vm_start and, for file or VM_SHM areas, when
 * mpnt->vm_offset continues exactly where prev leaves off.
 * On a merge mpnt is unlinked, prev->vm_end is extended to mpnt->vm_end,
 * mpnt is shrunk to an empty range before vm_ops->close() is called on it,
 * remove_shared_vm_struct() is called and mpnt is freed.
 * mm->mmap_cache is cleared at the end.
 *
 * NOTE(review): the continue statements, the handling of prev on a merge,
 * the guards around remove_shared_vm_struct() and any f_count or
 * map_count bookkeeping are not visible in this copy.
 */
634 void merge_segments (struct mm_struct
* mm
, unsigned long start_addr
, unsigned long end_addr
)
636 struct vm_area_struct
*prev
, *mpnt
, *next
;
642 while(mpnt
&& mpnt
->vm_end
<= start_addr
) {
644 mpnt
= mpnt
->vm_next
;
649 next
= mpnt
->vm_next
;
651 /* we have prev->vm_next == mpnt && mpnt->vm_next = next */
657 /* prev and mpnt cycle through the list, as long as
658 * start_addr < mpnt->vm_end && prev->vm_start < end_addr
660 for ( ; mpnt
&& prev
->vm_start
< end_addr
; prev
= mpnt
, mpnt
= next
) {
661 next
= mpnt
->vm_next
;
663 /* To share, we must have the same file, operations.. */
664 if ((mpnt
->vm_file
!= prev
->vm_file
)||
665 (mpnt
->vm_pte
!= prev
->vm_pte
) ||
666 (mpnt
->vm_ops
!= prev
->vm_ops
) ||
667 (mpnt
->vm_flags
!= prev
->vm_flags
) ||
668 (prev
->vm_end
!= mpnt
->vm_start
))
672 * If we have a file or it's a shared memory area
673 * the offsets must be contiguous..
675 if ((mpnt
->vm_file
!= NULL
) || (mpnt
->vm_flags
& VM_SHM
)) {
676 unsigned long off
= prev
->vm_offset
+prev
->vm_end
-prev
->vm_start
;
677 if (off
!= mpnt
->vm_offset
)
681 /* merge prev with mpnt and set up pointers so the new
682 * big segment can possibly merge with the next one.
683 * The old unused mpnt is freed.
686 mpnt
->vm_next
->vm_pprev
= mpnt
->vm_pprev
;
687 *mpnt
->vm_pprev
= mpnt
->vm_next
;
689 prev
->vm_end
= mpnt
->vm_end
;
690 if (mpnt
->vm_ops
&& mpnt
->vm_ops
->close
) {
691 mpnt
->vm_offset
+= mpnt
->vm_end
- mpnt
->vm_start
;
692 mpnt
->vm_start
= mpnt
->vm_end
;
693 mpnt
->vm_ops
->close(mpnt
);
696 remove_shared_vm_struct(mpnt
);
699 kmem_cache_free(vm_area_cachep
, mpnt
);
702 mm
->mmap_cache
= NULL
; /* Kill the cache. */
/*
 * vma_init - create the slab caches used by the mm code at init time.
 *
 * Two caches are created with kmem_cache_create() and SLAB_HWCACHE_ALIGN:
 * one named vm_area_struct, stored in vm_area_cachep, and one named
 * mm_struct, stored in mm_cachep.  Each creation is followed by a panic()
 * with a message naming the cache.
 *
 * NOTE(review): the remaining kmem_cache_create() arguments and the
 * conditions guarding the two panic() calls are not visible in this copy;
 * presumably they fire when the returned cache pointer is NULL -- confirm.
 */
707 __initfunc(void vma_init(void))
709 vm_area_cachep
= kmem_cache_create("vm_area_struct",
710 sizeof(struct vm_area_struct
),
711 0, SLAB_HWCACHE_ALIGN
,
714 panic("vma_init: Cannot alloc vm_area_struct cache.");
716 mm_cachep
= kmem_cache_create("mm_struct",
717 sizeof(struct mm_struct
),
718 0, SLAB_HWCACHE_ALIGN
,
721 panic("vma_init: Cannot alloc mm_struct cache.");