// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2007-2008 Paul Mackerras, IBM Corp.
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/syscalls.h>

#include <linux/pgtable.h>
#include <linux/uaccess.h>

/*
 * Free all pages allocated for subpage protection maps and pointers.
 * Also makes sure that the subpage_prot_table structure is
 * reinitialized for the next user.
 */
void subpage_prot_free(struct mm_struct *mm)
{
	struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
	unsigned long i, j, addr;
	u32 **p;

	if (!spt)
		return;

	for (i = 0; i < 4; ++i) {
		if (spt->low_prot[i]) {
			free_page((unsigned long)spt->low_prot[i]);
			spt->low_prot[i] = NULL;
		}
	}
	addr = 0;
	for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
		p = spt->protptrs[i];
		if (!p)
			continue;
		spt->protptrs[i] = NULL;
		for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
		     ++j, addr += PAGE_SIZE)
			if (p[j])
				free_page((unsigned long)p[j]);
		free_page((unsigned long)p);
	}
	spt->maxaddr = 0;
	kfree(spt);
}

static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
			     int npages)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return;
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return;
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return;
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
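	/*
	 * The pte_update() below clears and sets nothing; it is done
	 * purely for its side effect of invalidating any hash PTE that
	 * currently backs the linux PTE, so stale translations with the
	 * old subpage permissions are flushed out of the hash table.
	 */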
	for (; npages > 0; --npages) {
		pte_update(mm, addr, pte, 0, 0, 0);
		addr += PAGE_SIZE;
		++pte;
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
}

/*
 * Clear the subpage protection map for an address range, allowing
 * all accesses that are allowed by the pte permissions.
 */
static void subpage_prot_clear(unsigned long addr, unsigned long len)
{
	struct mm_struct *mm = current->mm;
	struct subpage_prot_table *spt;
	u32 **spm, *spp;
	unsigned long i;
	size_t nw;
	unsigned long next, limit;

	mmap_write_lock(mm);

	spt = mm_ctx_subpage_prot(&mm->context);
	if (!spt)
		goto err_out;

	limit = addr + len;
	if (limit > spt->maxaddr)
		limit = spt->maxaddr;
	for (; addr < limit; addr = next) {
		next = pmd_addr_end(addr, limit);
		if (addr < 0x100000000UL) {
			spm = spt->low_prot;
		} else {
			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
			if (!spm)
				continue;
		}
		spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
		if (!spp)
			continue;
		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);

		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
		nw = PTRS_PER_PTE - i;
		if (addr + (nw << PAGE_SHIFT) > next)
			nw = (next - addr) >> PAGE_SHIFT;

		memset(spp, 0, nw * sizeof(u32));

		/* now flush any existing HPTEs for the range */
		hpte_flush_range(mm, addr, nw);
	}

 err_out:
	mmap_write_unlock(mm);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
				  unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	split_huge_pmd(vma, pmd, addr);
	return 0;
}

static const struct mm_walk_ops subpage_walk_ops = {
	.pmd_entry	= subpage_walk_pmd_entry,
};

static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
				    unsigned long len)
{
	struct vm_area_struct *vma;

	/*
	 * We don't try too hard; we just mark all the VMAs in that range
	 * VM_NOHUGEPAGE and split them.
	 */
	vma = find_vma(mm, addr);
	/*
	 * If the range lies entirely in unmapped address space, just return.
	 */
	if (vma && ((addr + len) <= vma->vm_start))
		return;

	while (vma) {
		if (vma->vm_start >= (addr + len))
			break;
		vma->vm_flags |= VM_NOHUGEPAGE;
		walk_page_vma(vma, &subpage_walk_ops, NULL);
		vma = vma->vm_next;
	}
}
#else
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
				    unsigned long len)
{
	return;
}
#endif

/*
 * Copy in a subpage protection map for an address range.
 * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
 * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
 * 2 or 3 to prevent all accesses.
 * Note that the normal page protections also apply; the subpage
 * protection mechanism is an additional constraint, so putting 0
 * in a 2-bit field won't allow writes to a page that is otherwise
 * write-protected.
 */
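/*
 * Illustrative sketch, assuming a 64k base page size and a C library
 * that exposes the syscall number as __NR_subpage_prot: to prevent
 * writes to every 4k subpage of a single 64k page, userspace could
 * pass one map word with each 2-bit field set to 1, roughly:
 *
 *	u32 map = 0x55555555;	(sixteen 2-bit fields, each 0b01)
 *	syscall(__NR_subpage_prot, addr, 0x10000UL, &map);
 *
 * Passing a NULL map instead clears any existing subpage protections
 * over the range (see subpage_prot_clear() above).
 */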
SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
		unsigned long, len, u32 __user *, map)
{
	struct mm_struct *mm = current->mm;
	struct subpage_prot_table *spt;
	u32 **spm, *spp;
	unsigned long i;
	size_t nw;
	unsigned long next, limit;
	int err;

	if (radix_enabled())
		return -ENOENT;

	/* Check parameters */
	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
	    addr >= mm->task_size || len >= mm->task_size ||
	    addr + len > mm->task_size)
		return -EINVAL;

	if (is_hugepage_only_range(mm, addr, len))
		return -EINVAL;

	if (!map) {
		/* Clear out the protection map for the address range */
		subpage_prot_clear(addr, len);
		return 0;
	}

	if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
		return -EFAULT;

	mmap_write_lock(mm);

	spt = mm_ctx_subpage_prot(&mm->context);
	if (!spt) {
		/*
		 * Allocate subpage prot table if not already done.
		 * Do this with mmap_lock held.
		 */
		spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
		if (!spt) {
			err = -ENOMEM;
			goto out;
		}
		mm->context.hash_context->spt = spt;
	}

	subpage_mark_vma_nohuge(mm, addr, len);
	for (limit = addr + len; addr < limit; addr = next) {
		next = pmd_addr_end(addr, limit);
		err = -ENOMEM;
		if (addr < 0x100000000UL) {
			spm = spt->low_prot;
		} else {
			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
			if (!spm) {
				spm = (u32 **)get_zeroed_page(GFP_KERNEL);
				if (!spm)
					goto out;
				spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
			}
		}
		spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
		spp = *spm;
		if (!spp) {
			spp = (u32 *)get_zeroed_page(GFP_KERNEL);
			if (!spp)
				goto out;
			*spm = spp;
		}
		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);

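		/*
		 * Subpage protection is enforced via 4k hash PTEs, so the
		 * segment containing this address is demoted to a 4k base
		 * page size before the new map can take effect.
		 */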
		local_irq_disable();
		demote_segment_4k(mm, addr);
		local_irq_enable();

		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
		nw = PTRS_PER_PTE - i;
		if (addr + (nw << PAGE_SHIFT) > next)
			nw = (next - addr) >> PAGE_SHIFT;

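		/*
		 * Drop mmap_lock across the copy from userspace: the copy
		 * may fault, and the fault handler takes mmap_lock itself,
		 * which would deadlock with the write lock held here.
		 */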
		mmap_write_unlock(mm);
		if (__copy_from_user(spp, map, nw * sizeof(u32)))
			return -EFAULT;
		map += nw;
		mmap_write_lock(mm);

		/* now flush any existing HPTEs for the range */
		hpte_flush_range(mm, addr, nw);
	}
	if (limit > spt->maxaddr)
		spt->maxaddr = limit;
	err = 0;
 out:
	mmap_write_unlock(mm);
	return err;
}