// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * MMU context allocation for 64-bit kernels.
 *
 * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/pkeys.h>
#include <linux/spinlock.h>
#include <linux/idr.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/cpu.h>

#include <asm/mmu_context.h>
#include <asm/pgalloc.h>

#include "internal.h"

static DEFINE_IDA(mmu_context_ida);

static int alloc_context_id(int min_id, int max_id)
{
	return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
}

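/*
 * Reserve a specific context id. Callers expect the id to be available;
 * warn if the IDA hands back anything other than the requested id.
 */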
void hash__reserve_context_id(int id)
{
	int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);

	WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
}

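/*
 * Allocate a user context id. The upper bound depends on whether the MMU
 * supports 68-bit virtual addresses; with only 65-bit VA the usable range
 * of context ids is smaller.
 */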
int hash__alloc_context_id(void)
{
	unsigned long max;

	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
		max = MAX_USER_CONTEXT;
	else
		max = MAX_USER_CONTEXT_65BIT_VA;

	return alloc_context_id(MIN_USER_CONTEXT, max);
}
EXPORT_SYMBOL_GPL(hash__alloc_context_id);

static int realloc_context_ids(mm_context_t *ctx)
{
	int i, id;

	/*
	 * id 0 (aka. ctx->id) is special, we always allocate a new one, even if
	 * there wasn't one allocated previously (which happens in the exec
	 * case where ctx is newly allocated).
	 *
	 * We have to be a bit careful here. We must keep the existing ids in
	 * the array, so that we can test if they're non-zero to decide if we
	 * need to allocate a new one. However in case of error we must free the
	 * ids we've allocated but *not* any of the existing ones (or risk a
	 * UAF). That's why we decrement i at the start of the error handling
	 * loop, to skip the id that we just tested but couldn't reallocate.
	 */
	for (i = 0; i < ARRAY_SIZE(ctx->extended_id); i++) {
		if (i == 0 || ctx->extended_id[i]) {
			id = hash__alloc_context_id();
			if (id < 0)
				goto error;

			ctx->extended_id[i] = id;
		}
	}

	/* The caller expects us to return id */
	return ctx->id;

error:
	for (i--; i >= 0; i--) {
		if (ctx->extended_id[i])
			ida_free(&mmu_context_ida, ctx->extended_id[i]);
	}

	return id;
}

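/*
 * Set up the hash MMU context for a new mm: allocate the hash_mm_context,
 * initialise or copy the slice/subpage-protection state depending on
 * whether this is an exec (id == 0) or a fork, then allocate context ids.
 */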
static int hash__init_new_context(struct mm_struct *mm)
{
	int index;

	mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
					   GFP_KERNEL);
	if (!mm->context.hash_context)
		return -ENOMEM;

	/*
	 * The old code would re-promote on fork, we don't do that when using
	 * slices as it could cause problems promoting slices that have been
	 * forced down to 4K.
	 *
	 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
	 * explicitly against context.id == 0. This ensures that we properly
	 * initialize context slice details for newly allocated mm's (which will
	 * have id == 0) and don't alter context slice inherited via fork (which
	 * will have id != 0).
	 *
	 * We should not be calling init_new_context() on init_mm. Hence a
	 * check against 0 is OK.
	 */
	if (mm->context.id == 0) {
		memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
		slice_init_new_context_exec(mm);
	} else {
		/* This is fork. Copy hash_context details from current->mm */
		memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
#ifdef CONFIG_PPC_SUBPAGE_PROT
		/* inherit subpage prot details, if any. */
		if (current->mm->context.hash_context->spt) {
			mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table),
								GFP_KERNEL);
			if (!mm->context.hash_context->spt) {
				kfree(mm->context.hash_context);
				return -ENOMEM;
			}
		}
#endif
	}

	index = realloc_context_ids(&mm->context);
	if (index < 0) {
#ifdef CONFIG_PPC_SUBPAGE_PROT
		kfree(mm->context.hash_context->spt);
#endif
		kfree(mm->context.hash_context);
		return index;
	}

	pkey_mm_init(mm);
	return index;
}

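/* Reset slice and SLB state for the newly exec'd image. */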
void hash__setup_new_exec(void)
{
	slice_setup_new_exec();

	slb_setup_new_exec();
}

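/*
 * Radix: allocate a PID for this mm and install its process table entry.
 * Allocation starts at mmu_base_pid and is bounded by the number of
 * implemented PID bits.
 */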
static int radix__init_new_context(struct mm_struct *mm)
{
	unsigned long rts_field;
	int index, max_id;

	max_id = (1 << mmu_pid_bits) - 1;
	index = alloc_context_id(mmu_base_pid, max_id);
	if (index < 0)
		return index;

	/*
	 * Set the process table entry.
	 */
	rts_field = radix__get_tree_size();
	process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);

	/*
	 * Order the above store with subsequent update of the PID
	 * register (at which point HW can start loading/caching
	 * the entry) and the corresponding load by the MMU from
	 * the L2 cache.
	 */
	asm volatile("ptesync;isync" : : : "memory");

	mm->context.hash_context = NULL;

	return index;
}

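/*
 * Common entry point from the core mm code: take the radix or hash
 * initialisation path, then set up the page table fragment caches and
 * the accounting shared by both.
 */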
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
	int index;

	if (radix_enabled())
		index = radix__init_new_context(mm);
	else
		index = hash__init_new_context(mm);

	if (index < 0)
		return index;

	mm->context.id = index;

	mm->context.pte_frag = NULL;
	mm->context.pmd_frag = NULL;
#ifdef CONFIG_SPAPR_TCE_IOMMU
	mm_iommu_init(mm);
#endif
	atomic_set(&mm->context.active_cpus, 0);
	atomic_set(&mm->context.copros, 0);

	return 0;
}

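/* Release a single context id back to the IDA. */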
void __destroy_context(int context_id)
{
	ida_free(&mmu_context_ida, context_id);
}
EXPORT_SYMBOL_GPL(__destroy_context);

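/*
 * Release every context id owned by this mm (the primary id lives in
 * extended_id[0]) and free the hash context, if any.
 */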
static void destroy_contexts(mm_context_t *ctx)
{
	int index, context_id;

	for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
		context_id = ctx->extended_id[index];
		if (context_id)
			ida_free(&mmu_context_ida, context_id);
	}
	kfree(ctx->hash_context);
}

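/*
 * Drop the references still held by the cached PMD fragment and free the
 * backing page once the last fragment reference goes away.
 */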
static void pmd_frag_destroy(void *pmd_frag)
{
	int count;
	struct page *page;

	page = virt_to_page(pmd_frag);
	/* drop all the pending references */
	count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
	/* We allow PMD_FRAG_NR fragments from a PMD page */
	if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
		pgtable_pmd_page_dtor(page);
		__free_page(page);
	}
}

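/* Free any PTE/PMD fragments still cached in this mm's context. */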
static void destroy_pagetable_cache(struct mm_struct *mm)
{
	void *frag;

	frag = mm->context.pte_frag;
	if (frag)
		pte_frag_destroy(frag);

	frag = mm->context.pmd_frag;
	if (frag)
		pmd_frag_destroy(frag);
}

void destroy_context(struct mm_struct *mm)
{
#ifdef CONFIG_SPAPR_TCE_IOMMU
	WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
#endif
	/*
	 * For tasks which were successfully initialized we end up calling
	 * arch_exit_mmap() which clears the process table entry. And
	 * arch_exit_mmap() is called before the required fullmm TLB flush
	 * which does a RIC=2 flush. Hence for an initialized task, we do clear
	 * any cached process table entries.
	 *
	 * The condition below handles the error case during task init. We have
	 * set the process table entry early and if we fail a task
	 * initialization, we need to ensure the process table entry is zeroed.
	 * We need not worry about process table entry caches because the task
	 * never ran with the PID value.
	 */
	if (radix_enabled())
		process_tb[mm->context.id].prtb0 = 0;
	else
		subpage_prot_free(mm);
	destroy_contexts(&mm->context);
	mm->context.id = MMU_NO_CONTEXT;
}

void arch_exit_mmap(struct mm_struct *mm)
{
	destroy_pagetable_cache(mm);

	if (radix_enabled()) {
		/*
		 * Radix doesn't have a valid bit in the process table
		 * entries. However we know that at least the P9 implementation
		 * will avoid caching an entry with an invalid RTS field,
		 * and 0 is invalid. So this will do.
		 *
		 * This runs before the "fullmm" tlb flush in exit_mmap,
		 * which does a RIC=2 tlbie to clear the process table
		 * entry. See the "fullmm" comments in tlb-radix.c.
		 *
		 * No barrier required here after the store because
		 * this process will do the invalidate, which starts with
		 * ptesync.
		 */
		process_tb[mm->context.id].prtb0 = 0;
	}
}

#ifdef CONFIG_PPC_RADIX_MMU
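/*
 * Radix context switch: point the PID register at the next mm's context
 * id. The isync ensures the new PID takes effect before any subsequent
 * instructions execute.
 */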
void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
{
	mtspr(SPRN_PID, next->context.id);
	isync();
}
#endif

/**
 * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined)
 *
 * This clears the CPU from mm_cpumask for all processes, and then flushes the
 * local TLB to ensure TLB coherency in case the CPU is onlined again.
 *
 * KVM guest translations are not necessarily flushed here. If KVM started
 * using mm_cpumask or the Linux APIs which do, this would have to be resolved.
 */
#ifdef CONFIG_HOTPLUG_CPU
void cleanup_cpu_mmu_context(void)
{
	int cpu = smp_processor_id();

	clear_tasks_mm_cpumask(cpu);
	tlbiel_all();
}
#endif