// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
#include <linux/sched/sysctl.h>
#include <linux/userfaultfd_k.h>
#include <linux/memory-tiers.h>
#include <uapi/linux/mman.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>

#include "internal.h"

bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte)
{
        struct page *page;

        if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
                return false;

        /* Don't touch entries that are not even readable. */
        if (pte_protnone(pte))
                return false;

        /* Do we need write faults for softdirty tracking? */
        if (pte_needs_soft_dirty_wp(vma, pte))
                return false;

        /* Do we need write faults for uffd-wp tracking? */
        if (userfaultfd_pte_wp(vma, pte))
                return false;

        if (!(vma->vm_flags & VM_SHARED)) {
                /*
                 * Writable MAP_PRIVATE mapping: We can only special-case on
                 * exclusive anonymous pages, because we know that our
                 * write-fault handler similarly would map them writable without
                 * any additional checks while holding the PT lock.
                 */
                page = vm_normal_page(vma, addr, pte);
                return page && PageAnon(page) && PageAnonExclusive(page);
        }

        VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));

        /*
         * Writable MAP_SHARED mapping: "clean" might indicate that the FS still
         * needs a real write-fault for writenotify
         * (see vma_wants_writenotify()). If "dirty", the assumption is that the
         * FS was already notified and we can simply mark the PTE writable
         * just like the write-fault handler would do.
         */
        return pte_dirty(pte);
}

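/*
 * Illustrative caller pattern (a sketch only; the real user is the PTE loop
 * below): this helper merely decides whether a write upgrade is safe, the
 * caller still performs the actual pte_mkwrite():
 *
 *	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pte_write(ptent) &&
 *	    can_change_pte_writable(vma, addr, ptent))
 *		ptent = pte_mkwrite(ptent, vma);
 */
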
static long change_pte_range(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
                unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
        pte_t *pte, oldpte;
        spinlock_t *ptl;
        long pages = 0;
        int target_node = NUMA_NO_NODE;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

        tlb_change_page_size(tlb, PAGE_SIZE);
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!pte)
                return -EAGAIN;

        /* Get target node for single threaded private VMAs */
        if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
            atomic_read(&vma->vm_mm->mm_users) == 1)
                target_node = numa_node_id();

        flush_tlb_batched_pending(vma->vm_mm);
        arch_enter_lazy_mmu_mode();
        do {
                oldpte = ptep_get(pte);
                if (pte_present(oldpte)) {
                        pte_t ptent;

                        /*
                         * Avoid trapping faults against the zero or KSM
                         * pages. See similar comment in change_huge_pmd.
                         */
                        if (prot_numa) {
                                struct folio *folio;
                                int nid;
                                bool toptier;

                                /* Avoid TLB flush if possible */
                                if (pte_protnone(oldpte))
                                        continue;

                                folio = vm_normal_folio(vma, addr, oldpte);
                                if (!folio || folio_is_zone_device(folio) ||
                                    folio_test_ksm(folio))
                                        continue;

                                /* Also skip shared copy-on-write pages */
                                if (is_cow_mapping(vma->vm_flags) &&
                                    (folio_maybe_dma_pinned(folio) ||
                                     folio_likely_mapped_shared(folio)))
                                        continue;

                                /*
                                 * While migration can move some dirty pages,
                                 * it cannot move them all from MIGRATE_ASYNC
                                 * context.
                                 */
                                if (folio_is_file_lru(folio) &&
                                    folio_test_dirty(folio))
                                        continue;

                                /*
                                 * Don't mess with PTEs if page is already on the node
                                 * a single-threaded process is running on.
                                 */
                                nid = folio_nid(folio);
                                if (target_node == nid)
                                        continue;
                                toptier = node_is_toptier(nid);

                                /*
                                 * Skip scanning top tier node if normal numa
                                 * balancing is disabled
                                 */
                                if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
                                    toptier)
                                        continue;
                                if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
                                    !toptier)
                                        folio_xchg_access_time(folio,
                                                jiffies_to_msecs(jiffies));
                        }

                        oldpte = ptep_modify_prot_start(vma, addr, pte);
                        ptent = pte_modify(oldpte, newprot);

                        if (uffd_wp)
                                ptent = pte_mkuffd_wp(ptent);
                        else if (uffd_wp_resolve)
                                ptent = pte_clear_uffd_wp(ptent);

                        /*
                         * In some writable, shared mappings, we might want
                         * to catch actual write access -- see
                         * vma_wants_writenotify().
                         *
                         * In all writable, private mappings, we have to
                         * properly handle COW.
                         *
                         * In both cases, we can sometimes still change PTEs
                         * writable and avoid the write-fault handler, for
                         * example, if a PTE is already dirty and no other
                         * COW or special handling is required.
                         */
                        if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
                            !pte_write(ptent) &&
                            can_change_pte_writable(vma, addr, ptent))
                                ptent = pte_mkwrite(ptent, vma);

                        ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
                        if (pte_needs_flush(oldpte, ptent))
                                tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
                        pages++;
                } else if (is_swap_pte(oldpte)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
                        pte_t newpte;

                        if (is_writable_migration_entry(entry)) {
                                struct folio *folio = pfn_swap_entry_folio(entry);

                                /*
                                 * A protection check is difficult so
                                 * just be safe and disable write
                                 */
                                if (folio_test_anon(folio))
                                        entry = make_readable_exclusive_migration_entry(
                                                             swp_offset(entry));
                                else
                                        entry = make_readable_migration_entry(swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(oldpte))
                                        newpte = pte_swp_mksoft_dirty(newpte);
                        } else if (is_writable_device_private_entry(entry)) {
                                /*
                                 * We do not preserve soft-dirtiness. See
                                 * copy_nonpresent_pte() for explanation.
                                 */
                                entry = make_readable_device_private_entry(
                                                        swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_uffd_wp(oldpte))
                                        newpte = pte_swp_mkuffd_wp(newpte);
                        } else if (is_writable_device_exclusive_entry(entry)) {
                                entry = make_readable_device_exclusive_entry(
                                                        swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(oldpte))
                                        newpte = pte_swp_mksoft_dirty(newpte);
                                if (pte_swp_uffd_wp(oldpte))
                                        newpte = pte_swp_mkuffd_wp(newpte);
                        } else if (is_pte_marker_entry(entry)) {
                                /*
                                 * Ignore error swap entries unconditionally,
                                 * because any access should sigbus anyway.
                                 */
                                if (is_poisoned_swp_entry(entry))
                                        continue;
                                /*
                                 * If this is a uffd-wp pte marker and we'd like
                                 * to unprotect it, drop it; the next page
                                 * fault will trigger without uffd trapping.
                                 */
                                if (uffd_wp_resolve) {
                                        pte_clear(vma->vm_mm, addr, pte);
                                        pages++;
                                }
                                continue;
                        } else {
                                newpte = oldpte;
                        }

                        if (uffd_wp)
                                newpte = pte_swp_mkuffd_wp(newpte);
                        else if (uffd_wp_resolve)
                                newpte = pte_swp_clear_uffd_wp(newpte);

                        if (!pte_same(oldpte, newpte)) {
                                set_pte_at(vma->vm_mm, addr, pte, newpte);
                                pages++;
                        }
                } else {
                        /* It must be a none page, or what else?.. */
                        WARN_ON_ONCE(!pte_none(oldpte));

                        /*
                         * Nobody plays with any none ptes besides
                         * userfaultfd when applying the protections.
                         */
                        if (likely(!uffd_wp))
                                continue;

                        if (userfaultfd_wp_use_markers(vma)) {
                                /*
                                 * For file-backed mem, we need to be able to
                                 * wr-protect a none pte, because even if the
                                 * pte is none, the page/swap cache could
                                 * exist. Do that by installing a marker.
                                 */
                                set_pte_at(vma->vm_mm, addr, pte,
                                           make_pte_marker(PTE_MARKER_UFFD_WP));
                                pages++;
                        }
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);

        return pages;
}

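/*
 * Note: a negative return here (-EAGAIN when pte_offset_map_lock() found the
 * PTE table gone from under us) is not an error to propagate; the caller
 * retries the same pmd -- see the "goto again" in change_pmd_range() below.
 */
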
/*
 * Return true if we want to split THPs into PTE mappings in change
 * protection procedure, false otherwise.
 */
static inline bool
pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
        /*
         * Pte markers only reside at the pte level; if we need pte markers,
         * we need to split. We cannot wr-protect shmem thp because file
         * thp is handled differently when split by erasing the pmd so far.
         */
        return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}

/*
 * Return true if we want to populate pgtables in change protection
 * procedure, false otherwise.
 */
static inline bool
pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
        /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
        if (!(cp_flags & MM_CP_UFFD_WP))
                return false;

        /* Populate if the userfaultfd mode requires pte markers */
        return userfaultfd_wp_use_markers(vma);
}

/*
 * Populate the pgtable underneath for whatever reason if requested.
 * When {pte|pmd|...}_alloc() fails, we treat it the same way as pgtable
 * allocation failures during page faults: kick OOM and return an error.
 */
#define change_pmd_prepare(vma, pmd, cp_flags)				\
	({								\
		long err = 0;						\
		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
			if (pte_alloc(vma->vm_mm, pmd))			\
				err = -ENOMEM;				\
		}							\
		err;							\
	})

/*
 * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
 * have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
 * while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
 */
#define change_prepare(vma, high, low, addr, cp_flags)			\
	({								\
		long err = 0;						\
		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
			low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
			if (p == NULL)					\
				err = -ENOMEM;				\
		}							\
		err;							\
	})

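/*
 * Usage sketch (both forms appear in the walkers below): the "low" level
 * table is allocated under the "high" level entry, and a nonzero result is
 * an -ENOMEM that must be propagated:
 *
 *	ret = change_prepare(vma, pud, pmd, addr, cp_flags);
 *	if (ret)
 *		return ret;
 */
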
static inline long change_pmd_range(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
                unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
        pmd_t *pmd;
        unsigned long next;
        long pages = 0;
        unsigned long nr_huge_updates = 0;
        struct mmu_notifier_range range;

        range.start = 0;

        pmd = pmd_offset(pud, addr);
        do {
                long ret;
                pmd_t _pmd;
again:
                next = pmd_addr_end(addr, end);

                ret = change_pmd_prepare(vma, pmd, cp_flags);
                if (ret) {
                        pages = ret;
                        break;
                }

                if (pmd_none(*pmd))
                        goto next;

                /* invoke the mmu notifier if the pmd is populated */
                if (!range.start) {
                        mmu_notifier_range_init(&range,
                                MMU_NOTIFY_PROTECTION_VMA, 0,
                                vma->vm_mm, addr, end);
                        mmu_notifier_invalidate_range_start(&range);
                }

                _pmd = pmdp_get_lockless(pmd);
                if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
                        if ((next - addr != HPAGE_PMD_SIZE) ||
                            pgtable_split_needed(vma, cp_flags)) {
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                                /*
                                 * For file-backed, the pmd could have been
                                 * cleared; make sure pmd populated if
                                 * necessary, then fall-through to pte level.
                                 */
                                ret = change_pmd_prepare(vma, pmd, cp_flags);
                                if (ret) {
                                        pages = ret;
                                        break;
                                }
                        } else {
                                ret = change_huge_pmd(tlb, vma, pmd,
                                                      addr, newprot, cp_flags);
                                if (ret) {
                                        if (ret == HPAGE_PMD_NR) {
                                                pages += HPAGE_PMD_NR;
                                                nr_huge_updates++;
                                        }

                                        /* huge pmd was handled */
                                        goto next;
                                }
                        }
                        /* fall through, the trans huge pmd just split */
                }

                ret = change_pte_range(tlb, vma, pmd, addr, next, newprot,
                                       cp_flags);
                if (ret < 0)
                        goto again;
                pages += ret;
next:
                cond_resched();
        } while (pmd++, addr = next, addr != end);

        if (range.start)
                mmu_notifier_invalidate_range_end(&range);

        if (nr_huge_updates)
                count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
        return pages;
}

static inline long change_pud_range(struct mmu_gather *tlb,
                struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
                unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
        pud_t *pud;
        unsigned long next;
        long pages = 0, ret;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                ret = change_prepare(vma, pud, pmd, addr, cp_flags);
                if (ret)
                        return ret;
                if (pud_none_or_clear_bad(pud))
                        continue;
                pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
                                          cp_flags);
        } while (pud++, addr = next, addr != end);

        return pages;
}

static inline long change_p4d_range(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
                unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
        p4d_t *p4d;
        unsigned long next;
        long pages = 0, ret;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                ret = change_prepare(vma, p4d, pud, addr, cp_flags);
                if (ret)
                        return ret;
                if (p4d_none_or_clear_bad(p4d))
                        continue;
                pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
                                          cp_flags);
        } while (p4d++, addr = next, addr != end);

        return pages;
}

static long change_protection_range(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long addr,
                unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        unsigned long next;
        long pages = 0, ret;

        BUG_ON(addr >= end);
        pgd = pgd_offset(mm, addr);
        tlb_start_vma(tlb, vma);
        do {
                next = pgd_addr_end(addr, end);
                ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
                if (ret) {
                        pages = ret;
                        break;
                }
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
                                          cp_flags);
        } while (pgd++, addr = next, addr != end);

        tlb_end_vma(tlb, vma);

        return pages;
}

long change_protection(struct mmu_gather *tlb,
                       struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, unsigned long cp_flags)
{
        pgprot_t newprot = vma->vm_page_prot;
        long pages;

        BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);

#ifdef CONFIG_NUMA_BALANCING
        /*
         * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking)
         * are expected to reflect their requirements via VMA flags such that
         * vma_set_page_prot() will adjust vma->vm_page_prot accordingly.
         */
        if (cp_flags & MM_CP_PROT_NUMA)
                newprot = PAGE_NONE;
#else
        WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
#endif

        if (is_vm_hugetlb_page(vma))
                pages = hugetlb_change_protection(vma, start, end, newprot,
                                                  cp_flags);
        else
                pages = change_protection_range(tlb, vma, start, end, newprot,
                                                cp_flags);

        return pages;
}

static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
                               unsigned long next, struct mm_walk *walk)
{
        return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
                                  *(pgprot_t *)(walk->private)) ?
                0 : -EACCES;
}

static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                   unsigned long addr, unsigned long next,
                                   struct mm_walk *walk)
{
        return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
                                  *(pgprot_t *)(walk->private)) ?
                0 : -EACCES;
}

static int prot_none_test(unsigned long addr, unsigned long next,
                          struct mm_walk *walk)
{
        return 0;
}

static const struct mm_walk_ops prot_none_walk_ops = {
        .pte_entry      = prot_none_pte_entry,
        .hugetlb_entry  = prot_none_hugetlb_entry,
        .test_walk      = prot_none_test,
        .walk_lock      = PGWALK_WRLOCK,
};

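/*
 * These ops back the PROT_NONE PFN permission check in mprotect_fixup()
 * below, roughly:
 *
 *	pgprot_t new_pgprot = vm_get_page_prot(newflags);
 *	error = walk_page_range(current->mm, start, end,
 *				&prot_none_walk_ops, &new_pgprot);
 *
 * letting architectures with pfn_modify_allowed() restrictions veto the
 * transition before any VMA state has been changed.
 */
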
int
mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
               struct vm_area_struct *vma, struct vm_area_struct **pprev,
               unsigned long start, unsigned long end, unsigned long newflags)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long oldflags = vma->vm_flags;
        long nrpages = (end - start) >> PAGE_SHIFT;
        unsigned int mm_cp_flags = 0;
        unsigned long charged = 0;
        int error;

        if (newflags == oldflags) {
                *pprev = vma;
                return 0;
        }

        /*
         * Do PROT_NONE PFN permission checks here when we can still
         * bail out without undoing a lot of state. This is a rather
         * uncommon case, so doesn't need to be very optimized.
         */
        if (arch_has_pfn_modify_check() &&
            (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
            (newflags & VM_ACCESS_FLAGS) == 0) {
                pgprot_t new_pgprot = vm_get_page_prot(newflags);

                error = walk_page_range(current->mm, start, end,
                                        &prot_none_walk_ops, &new_pgprot);
                if (error)
                        return error;
        }

        /*
         * If we make a private mapping writable we increase our commit;
         * but (without finer accounting) cannot reduce our commit if we
         * make it unwritable again, except in the anonymous case where no
         * anon_vma has yet been assigned.
         *
         * hugetlb mappings were accounted for even if read-only, so there
         * is no need to account for them here.
         */
        if (newflags & VM_WRITE) {
                /* Check space limits when area turns into data. */
                if (!may_expand_vm(mm, newflags, nrpages) &&
                    may_expand_vm(mm, oldflags, nrpages))
                        return -ENOMEM;
                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
                                  VM_SHARED|VM_NORESERVE))) {
                        charged = nrpages;
                        if (security_vm_enough_memory_mm(mm, charged))
                                return -ENOMEM;
                        newflags |= VM_ACCOUNT;
                }
        } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) &&
                   !vma->anon_vma) {
                newflags &= ~VM_ACCOUNT;
        }

        vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags);
        if (IS_ERR(vma)) {
                error = PTR_ERR(vma);
                goto fail;
        }

        *pprev = vma;

        /*
         * vm_flags and vm_page_prot are protected by the mmap_lock
         * held in write mode.
         */
        vma_start_write(vma);
        vm_flags_reset(vma, newflags);
        if (vma_wants_manual_pte_write_upgrade(vma))
                mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
        vma_set_page_prot(vma);

        change_protection(tlb, vma, start, end, mm_cp_flags);

        if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT))
                vm_unacct_memory(nrpages);

        /*
         * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
         * fault on access.
         */
        if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
            (newflags & VM_WRITE)) {
                populate_vma_page_range(vma, start, end, NULL);
        }

        vm_stat_account(mm, oldflags, -nrpages);
        vm_stat_account(mm, newflags, nrpages);
        perf_event_mmap(vma);
        return 0;

fail:
        vm_unacct_memory(charged);
        return error;
}

/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
                            unsigned long prot, int pkey)
{
        unsigned long nstart, end, tmp, reqprot;
        struct vm_area_struct *vma, *prev;
        int error;
        const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
        const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
                                (prot & PROT_READ);
        struct mmu_gather tlb;
        struct vma_iterator vmi;

        start = untagged_addr(start);

        prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
        if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
                return -EINVAL;

        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (!len)
                return 0;
        len = PAGE_ALIGN(len);
        end = start + len;
        if (end <= start)
                return -ENOMEM;
        if (!arch_validate_prot(prot, start))
                return -EINVAL;

        reqprot = prot;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        /*
         * If userspace did not allocate the pkey, do not let
         * them use it here.
         */
        error = -EINVAL;
        if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
                goto out;

        vma_iter_init(&vmi, current->mm, start);
        vma = vma_find(&vmi, end);
        error = -ENOMEM;
        if (!vma)
                goto out;

        if (unlikely(grows & PROT_GROWSDOWN)) {
                if (vma->vm_start >= end)
                        goto out;
                start = vma->vm_start;
                error = -EINVAL;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto out;
        } else {
                if (vma->vm_start > start)
                        goto out;
                if (unlikely(grows & PROT_GROWSUP)) {
                        end = vma->vm_end;
                        error = -EINVAL;
                        if (!(vma->vm_flags & VM_GROWSUP))
                                goto out;
                }
        }

        /*
         * Check whether the memory is sealed.
         * can_modify_mm assumes we have acquired the lock on MM.
         */
        if (unlikely(!can_modify_mm(current->mm, start, end))) {
                error = -EPERM;
                goto out;
        }

        prev = vma_prev(&vmi);
        if (start > vma->vm_start)
                prev = vma;

        tlb_gather_mmu(&tlb, current->mm);
        nstart = start;
        tmp = vma->vm_start;
        for_each_vma_range(vmi, vma, end) {
                unsigned long mask_off_old_flags;
                unsigned long newflags;
                int new_vma_pkey;

                if (vma->vm_start != tmp) {
                        error = -ENOMEM;
                        break;
                }

                /* Does the application expect PROT_READ to imply PROT_EXEC */
                if (rier && (vma->vm_flags & VM_MAYEXEC))
                        prot |= PROT_EXEC;

                /*
                 * Each mprotect() call explicitly passes r/w/x permissions.
                 * If a permission is not passed to mprotect(), it must be
                 * cleared from the VMA.
                 */
                mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;

                new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
                newflags = calc_vm_prot_bits(prot, new_vma_pkey);
                newflags |= (vma->vm_flags & ~mask_off_old_flags);

                /* newflags >> 4 shift VM_MAY% in place of VM_% */
                if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
                        error = -EACCES;
                        break;
                }

                if (map_deny_write_exec(vma, newflags)) {
                        error = -EACCES;
                        break;
                }

                /* Allow architectures to sanity-check the new flags */
                if (!arch_validate_flags(newflags)) {
                        error = -EINVAL;
                        break;
                }

                error = security_file_mprotect(vma, reqprot, prot);
                if (error)
                        break;

                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;

                if (vma->vm_ops && vma->vm_ops->mprotect) {
                        error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
                        if (error)
                                break;
                }

                error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
                if (error)
                        break;

                tmp = vma_iter_end(&vmi);
                nstart = tmp;
                prot = reqprot;
        }
        tlb_finish_mmu(&tlb);

        if (!error && tmp < end)
                error = -ENOMEM;

out:
        mmap_write_unlock(current->mm);
        return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
                unsigned long, prot)
{
        return do_mprotect_pkey(start, len, prot, -1);
}

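/*
 * Userspace view (illustrative only, not kernel code): a minimal sketch of
 * the call this entry point services, dropping write permission from an
 * anonymous mapping:
 *
 *	char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	buf[0] = 1;
 *	if (mprotect(buf, 4096, PROT_READ))	// writes now SIGSEGV
 *		perror("mprotect");
 *
 * start must be page-aligned and len is rounded up to a whole page, per the
 * checks in do_mprotect_pkey() above.
 */
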
#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
                unsigned long, prot, int, pkey)
{
        return do_mprotect_pkey(start, len, prot, pkey);
}

SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
        int pkey;
        int ret;

        /* No flags supported yet. */
        if (flags)
                return -EINVAL;
        /* check for unsupported init values */
        if (init_val & ~PKEY_ACCESS_MASK)
                return -EINVAL;

        mmap_write_lock(current->mm);
        pkey = mm_pkey_alloc(current->mm);

        ret = -ENOSPC;
        if (pkey == -1)
                goto out;

        ret = arch_set_user_pkey_access(current, pkey, init_val);
        if (ret) {
                mm_pkey_free(current->mm, pkey);
                goto out;
        }

        ret = pkey;
out:
        mmap_write_unlock(current->mm);
        return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
        int ret;

        mmap_write_lock(current->mm);
        ret = mm_pkey_free(current->mm, pkey);
        mmap_write_unlock(current->mm);

        /*
         * We could provide warnings or errors if any VMA still
         * has the pkey set here.
         */
        return ret;
}

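/*
 * Userspace lifecycle sketch (illustrative only): allocate a key, attach it
 * to a mapping, and release it:
 *
 *	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
 *	if (pkey < 0)
 *		err(1, "pkey_alloc");
 *	if (pkey_mprotect(addr, len, PROT_READ | PROT_WRITE, pkey))
 *		err(1, "pkey_mprotect");
 *	...
 *	pkey_free(pkey);
 *
 * As the comment above notes, freeing a key does not scan for VMAs that
 * still carry it.
 */
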
#endif /* CONFIG_ARCH_HAS_PKEYS */