// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

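/*
 * Walk the ptes mapped by this pmd over [addr, end) and rewrite their
 * protection bits to newprot, honouring the MM_CP_* modifiers in cp_flags
 * (NUMA hinting, userfaultfd write-protect, dirty accounting).  Returns
 * the number of ptes that were actually updated.
 */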
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		unsigned long cp_flags)
{
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;
	int target_node = NUMA_NO_NODE;
	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

	/*
	 * Can be called with the mmap_lock held only for reading by
	 * prot_numa, so we must check that the pmd isn't constantly
	 * changing from under us, from pmd_none to pmd_trans_huge
	 * and/or the other way around.
	 */
	if (pmd_trans_unstable(pmd))
		return 0;

	/*
	 * The pmd points to a regular pte, so the pmd can't change
	 * from under us even if the mmap_lock is only held for
	 * reading.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);

	/* Get target node for single threaded private VMAs */
	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
	    atomic_read(&vma->vm_mm->mm_users) == 1)
		target_node = numa_node_id();

	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

				/* Also skip shared copy-on-write pages */
				if (is_cow_mapping(vma->vm_flags) &&
				    page_count(page) != 1)
					continue;

				/*
				 * While migration can move some dirty pages,
				 * it cannot move them all from MIGRATE_ASYNC
				 * context.
				 */
				if (page_is_file_lru(page) && PageDirty(page))
					continue;

				/*
				 * Don't mess with PTEs if page is already on the node
				 * a single-threaded process is running on.
				 */
				if (target_node == page_to_nid(page))
					continue;
			}

			oldpte = ptep_modify_prot_start(vma, addr, pte);
			ptent = pte_modify(oldpte, newprot);
			if (preserve_write)
				ptent = pte_mk_savedwrite(ptent);

			if (uffd_wp) {
				ptent = pte_wrprotect(ptent);
				ptent = pte_mkuffd_wp(ptent);
			} else if (uffd_wp_resolve) {
				/*
				 * Leave the write bit to be handled by the
				 * page fault handler, so that things like
				 * COW can be handled properly.
				 */
				ptent = pte_clear_uffd_wp(ptent);
			}

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
			pages++;
		} else if (is_swap_pte(oldpte)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);
			pte_t newpte;

			if (is_write_migration_entry(entry)) {
				/*
				 * A protection check is difficult, so
				 * just be safe and disable write.
				 */
				make_migration_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (is_write_device_private_entry(entry)) {
				/*
				 * We do not preserve soft-dirtiness. See
				 * copy_one_pte() for explanation.
				 */
				make_device_private_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else {
				newpte = oldpte;
			}

			if (uffd_wp)
				newpte = pte_swp_mkuffd_wp(newpte);
			else if (uffd_wp_resolve)
				newpte = pte_swp_clear_uffd_wp(newpte);

			if (!pte_same(oldpte, newpte)) {
				set_pte_at(vma->vm_mm, addr, pte, newpte);
				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

/*
 * Used when setting automatic NUMA hinting protection where it is
 * critical that a numa hinting PMD is not confused with a bad PMD.
 */
static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
{
	pmd_t pmdval = pmd_read_atomic(pmd);

	/* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	barrier();
#endif

	if (pmd_none(pmdval))
		return 1;
	if (pmd_trans_huge(pmdval))
		return 0;
	if (unlikely(pmd_bad(pmdval))) {
		pmd_clear_bad(pmd);
		return 1;
	}

	return 0;
}

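/*
 * Walk the pmds covering [addr, end): huge pmds are either split or changed
 * in place via change_huge_pmd(), regular pmds are handed to
 * change_pte_range().  The mmu notifier range is started lazily, once the
 * first populated pmd is found, and ended after the walk.
 */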
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, unsigned long cp_flags)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	struct mmu_notifier_range range;

	range.start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);

		/*
		 * Automatic NUMA balancing walks the tables with mmap_lock
		 * held for read. A parallel update could occur between
		 * pmd_trans_huge() and a pmd_none_or_clear_bad() check,
		 * leading to a false positive and clearing.
		 * Hence, it's necessary to atomically read the PMD value
		 * for all the checks.
		 */
		if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
		     pmd_none_or_clear_bad_unless_trans_huge(pmd))
			goto next;

		/* invoke the mmu notifier if the pmd is populated */
		if (!range.start) {
			mmu_notifier_range_init(&range,
				MMU_NOTIFY_PROTECTION_VMA, 0,
				vma, vma->vm_mm, addr, end);
			mmu_notifier_invalidate_range_start(&range);
		}

		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			} else {
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
							      newprot, cp_flags);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					goto next;
				}
			}
			/* fall through, the trans huge pmd just split */
		}
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
					      cp_flags);
		pages += this_pages;
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	if (range.start)
		mmu_notifier_invalidate_range_end(&range);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		p4d_t *p4d, unsigned long addr, unsigned long end,
		pgprot_t newprot, unsigned long cp_flags)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(vma, pud, addr, next, newprot,
					  cp_flags);
	} while (pud++, addr = next, addr != end);

	return pages;
}

static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, unsigned long cp_flags)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long pages = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		pages += change_pud_range(vma, p4d, addr, next, newprot,
					  cp_flags);
	} while (p4d++, addr = next, addr != end);

	return pages;
}

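/*
 * Top of the page table walk: flush the cache for the range, walk every
 * pgd/p4d/pud/pmd level updating protection bits, and flush the TLB only
 * if at least one entry was actually modified.
 */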
static unsigned long change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	inc_tlb_flush_pending(mm);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_p4d_range(vma, pgd, addr, next, newprot,
					  cp_flags);
	} while (pgd++, addr = next, addr != end);

	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
	dec_tlb_flush_pending(mm);

	return pages;
}

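/*
 * Change the page protection of [start, end) in this vma to newprot.
 * hugetlb VMAs are handled by hugetlb_change_protection(); everything else
 * goes through the generic page table walk above.  Returns the number of
 * page table entries that were changed.
 */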
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       unsigned long cp_flags)
{
	unsigned long pages;

	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot,
						cp_flags);

	return pages;
}

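/*
 * Page walk callbacks used by mprotect_fixup() to verify, before dropping
 * all access permissions on a VM_PFNMAP/VM_MIXEDMAP mapping, that the
 * architecture allows every pfn in the range to be mapped with the new
 * protection.
 */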
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
			       unsigned long next, struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
				   unsigned long addr, unsigned long next,
				   struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_test(unsigned long addr, unsigned long next,
			  struct mm_walk *walk)
{
	return 0;
}

static const struct mm_walk_ops prot_none_walk_ops = {
	.pte_entry		= prot_none_pte_entry,
	.hugetlb_entry		= prot_none_hugetlb_entry,
	.test_walk		= prot_none_test,
};

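/*
 * Apply newflags to the [start, end) part of the vma: charge the pages if
 * the mapping becomes writable, merge or split the VMA as needed, then
 * rewrite the page tables via change_protection().  *pprev is updated for
 * the caller to continue its VMA walk.
 */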
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * Do PROT_NONE PFN permission checks here when we can still
	 * bail out without undoing a lot of state. This is a rather
	 * uncommon case, so doesn't need to be very optimized.
	 */
	if (arch_has_pfn_modify_check() &&
	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
	    (newflags & VM_ACCESS_FLAGS) == 0) {
		pgprot_t new_pgprot = vm_get_page_prot(newflags);

		error = walk_page_range(current->mm, start, end,
				&prot_none_walk_ops, &new_pgprot);
		if (error)
			return error;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again. hugetlb mappings were accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
	if (*pprev) {
		vma = *pprev;
		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_lock
	 * held in write mode.
	 */
	vm_write_begin(vma);
	WRITE_ONCE(vma->vm_flags, newflags);
	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
	vma_set_page_prot(vma);

	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
	vm_write_end(vma);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

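/*
 * Common implementation behind mprotect() and pkey_mprotect(): validate the
 * requested range and protection, then walk the VMAs covering it and apply
 * the new flags to each via mprotect_fixup().
 */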
/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);

	start = untagged_addr(start);

	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot, start))
		return -EINVAL;

	reqprot = prot;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;
	prev = vma->vm_prev;
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned long mask_off_old_flags;
		unsigned long newflags;
		int new_vma_pkey;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		/* Does the application expect PROT_READ to imply PROT_EXEC? */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
					VM_FLAGS_CLEAR;

		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
		newflags |= (vma->vm_flags & ~mask_off_old_flags);

		/* newflags >> 4 shifts VM_MAY% into the place of VM_% */
		if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
			error = -EACCES;
			goto out;
		}

		/* Allow architectures to sanity-check the new flags */
		if (!arch_validate_flags(newflags)) {
			error = -EINVAL;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
		prot = reqprot;
	}
out:
	mmap_write_unlock(current->mm);
	return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}

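/*
 * Allocate a protection key for the calling process' mm and set its initial
 * access rights.  Returns the key number on success, -ENOSPC when no keys
 * are left, or a negative error code.
 */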
SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int pkey;
	int ret;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;
	/* check for unsupported init values */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	mmap_write_lock(current->mm);
	pkey = mm_pkey_alloc(current->mm);

	ret = -ENOSPC;
	if (pkey == -1)
		goto out;

	ret = arch_set_user_pkey_access(current, pkey, init_val);
	if (ret) {
		mm_pkey_free(current->mm, pkey);
		goto out;
	}
	ret = pkey;
out:
	mmap_write_unlock(current->mm);
	return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	mmap_write_lock(current->mm);
	ret = mm_pkey_free(current->mm, pkey);
	mmap_write_unlock(current->mm);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */