/*
 *	linux/mm/msync.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * The msync() system call.
 */
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/writeback.h>
#include <linux/file.h>
#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>

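/*
 * The helpers below walk the page tables top-down (pgd -> pud -> pmd -> pte)
 * over [addr, end).  msync_pte_range() transfers the hardware dirty bit of
 * each present pte to its struct page via set_page_dirty() and returns the
 * number of pages newly marked dirty.  To keep pte-lock hold times bounded
 * it checks need_resched()/need_lockbreak() every few dozen ptes (tracked by
 * the "progress" counter), drops the lock, cond_resched()s and restarts at
 * the "again" label.
 */
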
static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end)
{
	pte_t *pte;
	spinlock_t *ptl;
	int progress = 0;
	unsigned long ret = 0;

again:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;

		if (progress >= 64) {
			progress = 0;
			if (need_resched() || need_lockbreak(ptl))
				break;
		}
		progress++;
		if (!pte_present(*pte))
			continue;
		if (!pte_maybe_dirty(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		if (ptep_clear_flush_dirty(vma, addr, pte) ||
		    page_test_and_clear_dirty(page))
			ret += set_page_dirty(page);
		progress += 3;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	if (addr != end)
		goto again;
	return ret;
}

static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
			pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long ret = 0;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		ret += msync_pte_range(vma, pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
	return ret;
}

static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
			pgd_t *pgd, unsigned long addr, unsigned long end)
{
	pud_t *pud;
	unsigned long next;
	unsigned long ret = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret += msync_pmd_range(vma, pud, addr, next);
	} while (pud++, addr = next, addr != end);
	return ret;
}

static unsigned long msync_page_range(struct vm_area_struct *vma,
				unsigned long addr, unsigned long end)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long ret = 0;

	/* For hugepages we can't go walking the page table normally,
	 * but that's ok, hugetlbfs is memory based, so we don't need
	 * to do anything more on an msync().
	 */
	if (vma->vm_flags & VM_HUGETLB)
		return 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(vma->vm_mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret += msync_pud_range(vma, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
	return ret;
}

/*
 * MS_SYNC syncs the entire file - including mappings.
 *
 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).  Instead, it just
 * marks the relevant pages dirty.  The application may now run fsync() to
 * write out the dirty pages and wait on the writeout and check the result.
 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
 * async writeout immediately.
 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
 * applications.
 */
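
/*
 * Illustrative user-space sketch of the MS_ASYNC scheme described above.
 * This is not kernel code; fd, addr and len are assumed to come from a
 * prior open() and mmap(..., MAP_SHARED, fd, 0):
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	static void sync_mapping(void *addr, size_t len, int fd)
 *	{
 *		// Mark the mapping's dirty pages; no I/O is started here.
 *		if (msync(addr, len, MS_ASYNC) == -1) {
 *			perror("msync");
 *			exit(EXIT_FAILURE);
 *		}
 *		// Write out the dirty pages and wait for the result...
 *		if (fsync(fd) == -1) {
 *			perror("fsync");
 *			exit(EXIT_FAILURE);
 *		}
 *		// ...or, to start async writeout without waiting, the
 *		// application could instead call:
 *		//	posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
 *	}
 */
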
static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
			unsigned long end, int flags,
			unsigned long *nr_pages_dirtied)
{
	struct file *file = vma->vm_file;

	if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
		return -EBUSY;

	if (file && (vma->vm_flags & VM_SHARED))
		*nr_pages_dirtied = msync_page_range(vma, addr, end);
	return 0;
}

asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct *vma;
	int unmapped_error = 0;
	int error = -EINVAL;
	int done = 0;

	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	if (start & ~PAGE_MASK)
		goto out;
	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
		goto out;
	error = -ENOMEM;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -ENOMEM at the end (an illustrative
	 * user-space example follows this function).
	 */
	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, start);
	if (!vma) {
		error = -ENOMEM;
		goto out_unlock;
	}
	do {
		unsigned long nr_pages_dirtied = 0;
		struct file *file;

		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags,
							&nr_pages_dirtied);
				if (error)
					goto out_unlock;
			}
			error = unmapped_error;
			done = 1;
		} else {
			/* Here vma->vm_start <= start < vma->vm_end < end. */
			error = msync_interval(vma, start, vma->vm_end, flags,
						&nr_pages_dirtied);
			if (error)
				goto out_unlock;
		}
		file = vma->vm_file;
		start = vma->vm_end;
		if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
			get_file(file);
			up_read(&current->mm->mmap_sem);
			balance_dirty_pages_ratelimited_nr(file->f_mapping,
							nr_pages_dirtied);
			fput(file);
			down_read(&current->mm->mmap_sem);
			vma = find_vma(current->mm, start);
		} else if ((flags & MS_SYNC) && file &&
				(vma->vm_flags & VM_SHARED)) {
			get_file(file);
			up_read(&current->mm->mmap_sem);
			error = do_fsync(file, 0);
			fput(file);
			down_read(&current->mm->mmap_sem);
			if (error)
				goto out_unlock;
			vma = find_vma(current->mm, start);
		} else {
			vma = vma->vm_next;
		}
	} while (vma && !done);
out_unlock:
	up_read(&current->mm->mmap_sem);
out:
	return error;
}
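
/*
 * Illustrative user-space view of the checks above (a sketch only, not
 * kernel code; "addr" is assumed to be a char * to an existing MAP_SHARED
 * mapping of at least three pages and "page_size" its page size):
 *
 *	#include <sys/mman.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	// An unaligned start address is rejected with EINVAL.
 *	if (msync(addr + 1, page_size, MS_SYNC) == -1 && errno == EINVAL)
 *		printf("start must be page aligned\n");
 *
 *	// Punch a hole in the middle of a three-page range: the mapped
 *	// pages are still synced, but the hole is reported as ENOMEM.
 *	munmap(addr + page_size, page_size);
 *	if (msync(addr, 3 * page_size, MS_SYNC) == -1 && errno == ENOMEM)
 *		printf("range contained an unmapped hole\n");
 */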