// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
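/*
 * The array is indexed by tree level as used in kvmppc_mmu_walk_radix_tree()
 * below: the top level (3) must use 13 bits, levels 2 and 1 must use 9 bits,
 * and the lowest level (0) may use either 5 or 9 bits, which the walk
 * checks separately.
 */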

unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int old_pid, old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  (to != NULL) ? __pa(to): 0,
					  (from != NULL) ? __pa(from): 0, n);

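	/*
	 * The top two bits of the effective address select the quadrant
	 * (hence "quadrant << 62" below).  With the guest's LPID (and, for
	 * quadrant 1, the guest's PID) loaded into the SPRs, the uaccess
	 * copy below goes through the guest's translation.
	 */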
	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
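	/* Context-synchronise so the new LPID/PID take effect for the copy */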
	isync();

	pagefault_disable();
	if (is_load)
		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
	else
		ret = __copy_to_user_inatomic((void __user *)to, from, n);
	pagefault_enable();

	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);

static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid = vcpu->arch.pid;

	/* This would cause a data segment intr so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested lpid */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;

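	/*
	 * Clear the top bits of the effective address;
	 * __kvmhv_copy_tofrom_guest_radix() re-inserts the quadrant
	 * selector in bits 0:1.
	 */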
	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
				 unsigned long n)
{
	long ret;

	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
	if (ret > 0)
		memset(to + (n - ret), 0, ret);

	return ret;
}
EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
			       unsigned long n)
{
	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}
EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);

int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

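	/*
	 * RTS encodes the radix tree size as (total address bits - 31),
	 * so a 52-bit address space corresponds to RTS = 21.
	 */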
	offset = rts + 31;

	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));
		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

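	/*
	 * PRTS (the low bits of the table register) gives the table size as
	 * 2^(PRTS + 12) bytes, so values above 24 (a 64GB table) are rejected.
	 */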
	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = vcpu->arch.pid;
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (vcpu->arch.amr & (1ul << 62))
				gpte->may_read = 0;
			if (vcpu->arch.amr & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			     unsigned int pshift, unsigned int lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

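	/*
	 * When running as a nested hypervisor on pseries, the invalidation
	 * must be done via the H_TLB_INVALIDATE hcall; the RB value carries
	 * the address and the actual-page-size (AP) encoding.
	 */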
	psi = shift_to_mmu_psize(pshift);
	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
				lpid, rb);
	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
				lpid, TLBIEL_INVAL_SET_LPID);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
				      unsigned long clr, unsigned long set,
				      unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
			     pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	pte_t *pte;

	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
	/* pmd_populate() will only reference _pa(pte). */
	kmemleak_ignore(pte);

	return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	pmd_t *pmd;

	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
	/* pud_populate() will only reference _pa(pmd). */
	kmemleak_ignore(pmd);

	return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift,
		      const struct kvm_memory_slot *memslot,
		      unsigned int lpid)
{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

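	/*
	 * Clear the whole PTE (clr mask ~0UL) and keep the old value: the
	 * dirty bit and the host physical address from it are used below
	 * for dirty-map and nested-rmap bookkeeping.
	 */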
	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift) { /* 1GB or 2MB page */
		page_size = 1ul << shift;
		if (shift == PMD_SHIFT)
			kvm->stat.num_2M_pages--;
		else if (shift == PUD_SHIFT)
			kvm->stat.num_1G_pages--;
	}

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of page fault path
 * (full == false), valid ptes are generally not expected; however, there
 * is one situation where they arise, which is when dirty page logging is
 * turned off for a memslot while the VM is running.  The new memslot
 * becomes visible to page faults before the memslot commit function
 * gets to flush the memslot, which can lead to a 2MB page mapping being
 * installed for a guest physical address where there are already 64kB
 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  unsigned int lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  unsigned int lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_is_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
					 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_map(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  unsigned int lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_is_leaf(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		p4d_t *p4d = p4d_offset(pgd, 0);
		pud_t *pud;

		if (!p4d_present(*p4d))
			continue;
		pud = pud_offset(p4d, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		p4d_clear(p4d);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					unsigned long gpa, unsigned int lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page can no longer be cached by the MMU, so it can be freed
	 * without flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					unsigned long gpa, unsigned int lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any child pte pages will no longer be cached by the MMU,
	 * so they can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry: the RC bits change in the course of
 * cleaning and aging, and the write bit can change either because the
 * access has been upgraded or because a read fault happened concurrently
 * with a write fault that set those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, unsigned int lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	p4d = p4d_offset(pgd, gpa);

	pud = NULL;
	if (p4d_present(*p4d))
		pud = pud_offset(p4d, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

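	/*
	 * The allocations above are done before taking kvm->mmu_lock;
	 * whatever turns out not to be needed is freed at out_unlock.
	 */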
	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (p4d_none(*p4d)) {
		if (!new_pud)
			goto out_unlock;
		p4d_populate(kvm->mm, p4d, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(p4d, gpa);
	if (pud_is_leaf(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					      0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_is_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}

bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
			     unsigned long gpa, unsigned int lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;

	if (nested)
		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
	else
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);

	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	bool *upgrade_p = &upgrade_write;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/*
	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
	 * do it with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
	hva = gfn_to_hva_memslot(memslot, gfn);
	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
		upgrade_write = true;
	} else {
		unsigned long pfn;

		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, upgrade_p);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	pte = __pte(0);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!pte_present(pte)) {
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

	/* Get pte level from shift/size */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}
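	/* level 0 = PAGE_SIZE, level 1 = 2MB (PMD level), level 2 = 1GB (PUD level) */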

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}

int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	bool kvm_ro = false;

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;
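	/*
	 * The masking above strips the page offset and the top address
	 * nibble from fault_gpa; the offset is then re-inserted from the
	 * effective address, except for process-table faults where the
	 * low bits are not meaningful.
	 */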

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return kvmppc_send_page_to_uv(kvm, gfn);

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
						       DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
		kvm_ro = true;
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
					    gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     kvm_ro, NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}

/* Called with kvm->mmu_lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		    unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
		return 0;
	}

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
	return 0;
}

/* Called with kvm->mmu_lock held */
int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		  unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					      gpa, shift);
		/* XXX need to flush tlb here? */
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		ref = 1;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		       unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = 1;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep, pte;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ret;

	/*
	 * For performance reasons we don't hold kvm->mmu_lock while walking the
	 * partition scoped table.
	 */
	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
	if (!ptep)
		return 0;

	pte = READ_ONCE(*ptep);
	if (pte_present(pte) && pte_dirty(pte)) {
		spin_lock(&kvm->mmu_lock);
		/*
		 * Recheck the pte now that we hold the mmu_lock
		 */
		if (pte_val(pte) != pte_val(*ptep)) {
			/*
			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
			 * only find PAGE_SIZE pte entries here. We can continue
			 * to use the pte addr returned by above page table
			 * walk.
			 */
			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
				spin_unlock(&kvm->mmu_lock);
				return 0;
			}
		}

		ret = 1;
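		/*
		 * Dirty logging forces PAGE_SIZE mappings for this memslot,
		 * so a huge-page (non-zero shift) mapping is unexpected here.
		 */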
1105*4882a593Smuzhiyun 		VM_BUG_ON(shift);
1106*4882a593Smuzhiyun 		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1107*4882a593Smuzhiyun 					      gpa, shift);
1108*4882a593Smuzhiyun 		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1109*4882a593Smuzhiyun 		/* Also clear bit in ptes in shadow pgtable for nested guests */
1110*4882a593Smuzhiyun 		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1111*4882a593Smuzhiyun 		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1112*4882a593Smuzhiyun 					       old & PTE_RPN_MASK,
1113*4882a593Smuzhiyun 					       1UL << shift);
1114*4882a593Smuzhiyun 		spin_unlock(&kvm->mmu_lock);
1115*4882a593Smuzhiyun 	}
1116*4882a593Smuzhiyun 	return ret;
1117*4882a593Smuzhiyun }
1118*4882a593Smuzhiyun 
kvmppc_hv_get_dirty_log_radix(struct kvm * kvm,struct kvm_memory_slot * memslot,unsigned long * map)1119*4882a593Smuzhiyun long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1120*4882a593Smuzhiyun 			struct kvm_memory_slot *memslot, unsigned long *map)
1121*4882a593Smuzhiyun {
1122*4882a593Smuzhiyun 	unsigned long i, j;
1123*4882a593Smuzhiyun 	int npages;
1124*4882a593Smuzhiyun 
1125*4882a593Smuzhiyun 	for (i = 0; i < memslot->npages; i = j) {
1126*4882a593Smuzhiyun 		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1127*4882a593Smuzhiyun 
1128*4882a593Smuzhiyun 		/*
1129*4882a593Smuzhiyun 		 * Note that if npages > 0 then i must be a multiple of npages,
1130*4882a593Smuzhiyun 		 * since huge pages are only used to back the guest at guest
1131*4882a593Smuzhiyun 		 * real addresses that are a multiple of their size.
1132*4882a593Smuzhiyun 		 * Since we have at most one PTE covering any given guest
1133*4882a593Smuzhiyun 		 * real address, if npages > 1 we can skip to i + npages.
1134*4882a593Smuzhiyun 		 */
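		/*
		 * For example, if a 2M backing page were ever reported with
		 * a 64k PAGE_SIZE, npages would be 32, i a multiple of 32,
		 * and j = i + 32 would skip the rest of that huge page.
		 */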
1135*4882a593Smuzhiyun 		j = i + 1;
1136*4882a593Smuzhiyun 		if (npages) {
1137*4882a593Smuzhiyun 			set_dirty_bits(map, i, npages);
1138*4882a593Smuzhiyun 			j = i + npages;
1139*4882a593Smuzhiyun 		}
1140*4882a593Smuzhiyun 	}
1141*4882a593Smuzhiyun 	return 0;
1142*4882a593Smuzhiyun }
1143*4882a593Smuzhiyun 
1144*4882a593Smuzhiyun void kvmppc_radix_flush_memslot(struct kvm *kvm,
1145*4882a593Smuzhiyun 				const struct kvm_memory_slot *memslot)
1146*4882a593Smuzhiyun {
1147*4882a593Smuzhiyun 	unsigned long n;
1148*4882a593Smuzhiyun 	pte_t *ptep;
1149*4882a593Smuzhiyun 	unsigned long gpa;
1150*4882a593Smuzhiyun 	unsigned int shift;
1151*4882a593Smuzhiyun 
1152*4882a593Smuzhiyun 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
1153*4882a593Smuzhiyun 		kvmppc_uvmem_drop_pages(memslot, kvm, true);
1154*4882a593Smuzhiyun 
1155*4882a593Smuzhiyun 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1156*4882a593Smuzhiyun 		return;
1157*4882a593Smuzhiyun 
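	/*
	 * Walk the slot in PAGE_SIZE steps; when kvmppc_unmap_pte tears
	 * down a larger mapping, the remaining lookups inside that range
	 * simply find no pte.
	 */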
1158*4882a593Smuzhiyun 	gpa = memslot->base_gfn << PAGE_SHIFT;
1159*4882a593Smuzhiyun 	spin_lock(&kvm->mmu_lock);
1160*4882a593Smuzhiyun 	for (n = memslot->npages; n; --n) {
1161*4882a593Smuzhiyun 		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1162*4882a593Smuzhiyun 		if (ptep && pte_present(*ptep))
1163*4882a593Smuzhiyun 			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1164*4882a593Smuzhiyun 					 kvm->arch.lpid);
1165*4882a593Smuzhiyun 		gpa += PAGE_SIZE;
1166*4882a593Smuzhiyun 	}
1167*4882a593Smuzhiyun 	/*
1168*4882a593Smuzhiyun 	 * Increase the mmu notifier sequence number to prevent any page
1169*4882a593Smuzhiyun 	 * fault that read the memslot earlier from writing a PTE.
1170*4882a593Smuzhiyun 	 */
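	/*
	 * Page faults that sampled mmu_notifier_seq before this point will
	 * see the sequence change and retry rather than install a stale
	 * mapping.
	 */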
1171*4882a593Smuzhiyun 	kvm->mmu_notifier_seq++;
1172*4882a593Smuzhiyun 	spin_unlock(&kvm->mmu_lock);
1173*4882a593Smuzhiyun }
1174*4882a593Smuzhiyun 
1175*4882a593Smuzhiyun static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1176*4882a593Smuzhiyun 				 int psize, int *indexp)
1177*4882a593Smuzhiyun {
1178*4882a593Smuzhiyun 	if (!mmu_psize_defs[psize].shift)
1179*4882a593Smuzhiyun 		return;
1180*4882a593Smuzhiyun 	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1181*4882a593Smuzhiyun 		(mmu_psize_defs[psize].ap << 29);
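	/*
	 * The encoding packs the page shift into the low bits and the AP
	 * value into bits 31:29, e.g. a 64k page (shift 16, AP 5 on P9
	 * radix) would be reported as 0xa0000010.
	 */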
1182*4882a593Smuzhiyun 	++(*indexp);
1183*4882a593Smuzhiyun }
1184*4882a593Smuzhiyun 
1185*4882a593Smuzhiyun int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1186*4882a593Smuzhiyun {
1187*4882a593Smuzhiyun 	int i;
1188*4882a593Smuzhiyun 
1189*4882a593Smuzhiyun 	if (!radix_enabled())
1190*4882a593Smuzhiyun 		return -EINVAL;
1191*4882a593Smuzhiyun 	memset(info, 0, sizeof(*info));
1192*4882a593Smuzhiyun 
1193*4882a593Smuzhiyun 	/* 4k page size */
1194*4882a593Smuzhiyun 	info->geometries[0].page_shift = 12;
1195*4882a593Smuzhiyun 	info->geometries[0].level_bits[0] = 9;
1196*4882a593Smuzhiyun 	for (i = 1; i < 4; ++i)
1197*4882a593Smuzhiyun 		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1198*4882a593Smuzhiyun 	/* 64k page size */
1199*4882a593Smuzhiyun 	info->geometries[1].page_shift = 16;
1200*4882a593Smuzhiyun 	for (i = 0; i < 4; ++i)
1201*4882a593Smuzhiyun 		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
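	/*
	 * With all four levels in use, either geometry resolves 52 address
	 * bits: 12 + 9 + 9 + 9 + 13 for 4k pages and 16 + 5 + 9 + 9 + 13
	 * for 64k pages.
	 */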
1202*4882a593Smuzhiyun 
1203*4882a593Smuzhiyun 	i = 0;
1204*4882a593Smuzhiyun 	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1205*4882a593Smuzhiyun 	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1206*4882a593Smuzhiyun 	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1207*4882a593Smuzhiyun 	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1208*4882a593Smuzhiyun 
1209*4882a593Smuzhiyun 	return 0;
1210*4882a593Smuzhiyun }
1211*4882a593Smuzhiyun 
1212*4882a593Smuzhiyun int kvmppc_init_vm_radix(struct kvm *kvm)
1213*4882a593Smuzhiyun {
1214*4882a593Smuzhiyun 	kvm->arch.pgtable = pgd_alloc(kvm->mm);
1215*4882a593Smuzhiyun 	if (!kvm->arch.pgtable)
1216*4882a593Smuzhiyun 		return -ENOMEM;
1217*4882a593Smuzhiyun 	return 0;
1218*4882a593Smuzhiyun }
1219*4882a593Smuzhiyun 
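/*
 * kmem_cache constructors: zero radix PTE/PMD table pages when the caches
 * first create them, so tables allocated from kvm_pte_cache and kvm_pmd_cache
 * start with no valid entries.
 */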
1220*4882a593Smuzhiyun static void pte_ctor(void *addr)
1221*4882a593Smuzhiyun {
1222*4882a593Smuzhiyun 	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1223*4882a593Smuzhiyun }
1224*4882a593Smuzhiyun 
1225*4882a593Smuzhiyun static void pmd_ctor(void *addr)
1226*4882a593Smuzhiyun {
1227*4882a593Smuzhiyun 	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1228*4882a593Smuzhiyun }
1229*4882a593Smuzhiyun 
1230*4882a593Smuzhiyun struct debugfs_radix_state {
1231*4882a593Smuzhiyun 	struct kvm	*kvm;
1232*4882a593Smuzhiyun 	struct mutex	mutex;
1233*4882a593Smuzhiyun 	unsigned long	gpa;
1234*4882a593Smuzhiyun 	int		lpid;
1235*4882a593Smuzhiyun 	int		chars_left;
1236*4882a593Smuzhiyun 	int		buf_index;
1237*4882a593Smuzhiyun 	char		buf[128];
1238*4882a593Smuzhiyun 	u8		hdr;
1239*4882a593Smuzhiyun };
1240*4882a593Smuzhiyun 
1241*4882a593Smuzhiyun static int debugfs_radix_open(struct inode *inode, struct file *file)
1242*4882a593Smuzhiyun {
1243*4882a593Smuzhiyun 	struct kvm *kvm = inode->i_private;
1244*4882a593Smuzhiyun 	struct debugfs_radix_state *p;
1245*4882a593Smuzhiyun 
1246*4882a593Smuzhiyun 	p = kzalloc(sizeof(*p), GFP_KERNEL);
1247*4882a593Smuzhiyun 	if (!p)
1248*4882a593Smuzhiyun 		return -ENOMEM;
1249*4882a593Smuzhiyun 
1250*4882a593Smuzhiyun 	kvm_get_kvm(kvm);
1251*4882a593Smuzhiyun 	p->kvm = kvm;
1252*4882a593Smuzhiyun 	mutex_init(&p->mutex);
1253*4882a593Smuzhiyun 	file->private_data = p;
1254*4882a593Smuzhiyun 
1255*4882a593Smuzhiyun 	return nonseekable_open(inode, file);
1256*4882a593Smuzhiyun }
1257*4882a593Smuzhiyun 
1258*4882a593Smuzhiyun static int debugfs_radix_release(struct inode *inode, struct file *file)
1259*4882a593Smuzhiyun {
1260*4882a593Smuzhiyun 	struct debugfs_radix_state *p = file->private_data;
1261*4882a593Smuzhiyun 
1262*4882a593Smuzhiyun 	kvm_put_kvm(p->kvm);
1263*4882a593Smuzhiyun 	kfree(p);
1264*4882a593Smuzhiyun 	return 0;
1265*4882a593Smuzhiyun }
1266*4882a593Smuzhiyun 
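/*
 * Dump the partition-scoped radix tree of this guest (and of any nested
 * guests) as text, one " <gpa>: <pte> <shift>" line per valid leaf, keeping
 * any partially copied line in p->buf across reads.
 */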
1267*4882a593Smuzhiyun static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1268*4882a593Smuzhiyun 				 size_t len, loff_t *ppos)
1269*4882a593Smuzhiyun {
1270*4882a593Smuzhiyun 	struct debugfs_radix_state *p = file->private_data;
1271*4882a593Smuzhiyun 	ssize_t ret, r;
1272*4882a593Smuzhiyun 	unsigned long n;
1273*4882a593Smuzhiyun 	struct kvm *kvm;
1274*4882a593Smuzhiyun 	unsigned long gpa;
1275*4882a593Smuzhiyun 	pgd_t *pgt;
1276*4882a593Smuzhiyun 	struct kvm_nested_guest *nested;
1277*4882a593Smuzhiyun 	pgd_t *pgdp;
1278*4882a593Smuzhiyun 	p4d_t p4d, *p4dp;
1279*4882a593Smuzhiyun 	pud_t pud, *pudp;
1280*4882a593Smuzhiyun 	pmd_t pmd, *pmdp;
1281*4882a593Smuzhiyun 	pte_t *ptep;
1282*4882a593Smuzhiyun 	int shift;
1283*4882a593Smuzhiyun 	unsigned long pte;
1284*4882a593Smuzhiyun 
1285*4882a593Smuzhiyun 	kvm = p->kvm;
1286*4882a593Smuzhiyun 	if (!kvm_is_radix(kvm))
1287*4882a593Smuzhiyun 		return 0;
1288*4882a593Smuzhiyun 
1289*4882a593Smuzhiyun 	ret = mutex_lock_interruptible(&p->mutex);
1290*4882a593Smuzhiyun 	if (ret)
1291*4882a593Smuzhiyun 		return ret;
1292*4882a593Smuzhiyun 
1293*4882a593Smuzhiyun 	if (p->chars_left) {
1294*4882a593Smuzhiyun 		n = p->chars_left;
1295*4882a593Smuzhiyun 		if (n > len)
1296*4882a593Smuzhiyun 			n = len;
1297*4882a593Smuzhiyun 		r = copy_to_user(buf, p->buf + p->buf_index, n);
1298*4882a593Smuzhiyun 		n -= r;
1299*4882a593Smuzhiyun 		p->chars_left -= n;
1300*4882a593Smuzhiyun 		p->buf_index += n;
1301*4882a593Smuzhiyun 		buf += n;
1302*4882a593Smuzhiyun 		len -= n;
1303*4882a593Smuzhiyun 		ret = n;
1304*4882a593Smuzhiyun 		if (r) {
1305*4882a593Smuzhiyun 			if (!n)
1306*4882a593Smuzhiyun 				ret = -EFAULT;
1307*4882a593Smuzhiyun 			goto out;
1308*4882a593Smuzhiyun 		}
1309*4882a593Smuzhiyun 	}
1310*4882a593Smuzhiyun 
1311*4882a593Smuzhiyun 	gpa = p->gpa;
1312*4882a593Smuzhiyun 	nested = NULL;
1313*4882a593Smuzhiyun 	pgt = NULL;
1314*4882a593Smuzhiyun 	while (len != 0 && p->lpid >= 0) {
1315*4882a593Smuzhiyun 		if (gpa >= RADIX_PGTABLE_RANGE) {
1316*4882a593Smuzhiyun 			gpa = 0;
1317*4882a593Smuzhiyun 			pgt = NULL;
1318*4882a593Smuzhiyun 			if (nested) {
1319*4882a593Smuzhiyun 				kvmhv_put_nested(nested);
1320*4882a593Smuzhiyun 				nested = NULL;
1321*4882a593Smuzhiyun 			}
1322*4882a593Smuzhiyun 			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1323*4882a593Smuzhiyun 			p->hdr = 0;
1324*4882a593Smuzhiyun 			if (p->lpid < 0)
1325*4882a593Smuzhiyun 				break;
1326*4882a593Smuzhiyun 		}
1327*4882a593Smuzhiyun 		if (!pgt) {
1328*4882a593Smuzhiyun 			if (p->lpid == 0) {
1329*4882a593Smuzhiyun 				pgt = kvm->arch.pgtable;
1330*4882a593Smuzhiyun 			} else {
1331*4882a593Smuzhiyun 				nested = kvmhv_get_nested(kvm, p->lpid, false);
1332*4882a593Smuzhiyun 				if (!nested) {
1333*4882a593Smuzhiyun 					gpa = RADIX_PGTABLE_RANGE;
1334*4882a593Smuzhiyun 					continue;
1335*4882a593Smuzhiyun 				}
1336*4882a593Smuzhiyun 				pgt = nested->shadow_pgtable;
1337*4882a593Smuzhiyun 			}
1338*4882a593Smuzhiyun 		}
1339*4882a593Smuzhiyun 		n = 0;
1340*4882a593Smuzhiyun 		if (!p->hdr) {
1341*4882a593Smuzhiyun 			if (p->lpid > 0)
1342*4882a593Smuzhiyun 				n = scnprintf(p->buf, sizeof(p->buf),
1343*4882a593Smuzhiyun 					      "\nNested LPID %d: ", p->lpid);
1344*4882a593Smuzhiyun 			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1345*4882a593Smuzhiyun 				      "pgdir: %lx\n", (unsigned long)pgt);
1346*4882a593Smuzhiyun 			p->hdr = 1;
1347*4882a593Smuzhiyun 			goto copy;
1348*4882a593Smuzhiyun 		}
1349*4882a593Smuzhiyun 
1350*4882a593Smuzhiyun 		pgdp = pgt + pgd_index(gpa);
1351*4882a593Smuzhiyun 		p4dp = p4d_offset(pgdp, gpa);
1352*4882a593Smuzhiyun 		p4d = READ_ONCE(*p4dp);
1353*4882a593Smuzhiyun 		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
1354*4882a593Smuzhiyun 			gpa = (gpa & P4D_MASK) + P4D_SIZE;
1355*4882a593Smuzhiyun 			continue;
1356*4882a593Smuzhiyun 		}
1357*4882a593Smuzhiyun 
1358*4882a593Smuzhiyun 		pudp = pud_offset(&p4d, gpa);
1359*4882a593Smuzhiyun 		pud = READ_ONCE(*pudp);
1360*4882a593Smuzhiyun 		if (!(pud_val(pud) & _PAGE_PRESENT)) {
1361*4882a593Smuzhiyun 			gpa = (gpa & PUD_MASK) + PUD_SIZE;
1362*4882a593Smuzhiyun 			continue;
1363*4882a593Smuzhiyun 		}
1364*4882a593Smuzhiyun 		if (pud_val(pud) & _PAGE_PTE) {
1365*4882a593Smuzhiyun 			pte = pud_val(pud);
1366*4882a593Smuzhiyun 			shift = PUD_SHIFT;
1367*4882a593Smuzhiyun 			goto leaf;
1368*4882a593Smuzhiyun 		}
1369*4882a593Smuzhiyun 
1370*4882a593Smuzhiyun 		pmdp = pmd_offset(&pud, gpa);
1371*4882a593Smuzhiyun 		pmd = READ_ONCE(*pmdp);
1372*4882a593Smuzhiyun 		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1373*4882a593Smuzhiyun 			gpa = (gpa & PMD_MASK) + PMD_SIZE;
1374*4882a593Smuzhiyun 			continue;
1375*4882a593Smuzhiyun 		}
1376*4882a593Smuzhiyun 		if (pmd_val(pmd) & _PAGE_PTE) {
1377*4882a593Smuzhiyun 			pte = pmd_val(pmd);
1378*4882a593Smuzhiyun 			shift = PMD_SHIFT;
1379*4882a593Smuzhiyun 			goto leaf;
1380*4882a593Smuzhiyun 		}
1381*4882a593Smuzhiyun 
1382*4882a593Smuzhiyun 		ptep = pte_offset_kernel(&pmd, gpa);
1383*4882a593Smuzhiyun 		pte = pte_val(READ_ONCE(*ptep));
1384*4882a593Smuzhiyun 		if (!(pte & _PAGE_PRESENT)) {
1385*4882a593Smuzhiyun 			gpa += PAGE_SIZE;
1386*4882a593Smuzhiyun 			continue;
1387*4882a593Smuzhiyun 		}
1388*4882a593Smuzhiyun 		shift = PAGE_SHIFT;
1389*4882a593Smuzhiyun 	leaf:
1390*4882a593Smuzhiyun 		n = scnprintf(p->buf, sizeof(p->buf),
1391*4882a593Smuzhiyun 			      " %lx: %lx %d\n", gpa, pte, shift);
1392*4882a593Smuzhiyun 		gpa += 1ul << shift;
1393*4882a593Smuzhiyun 	copy:
1394*4882a593Smuzhiyun 		p->chars_left = n;
1395*4882a593Smuzhiyun 		if (n > len)
1396*4882a593Smuzhiyun 			n = len;
1397*4882a593Smuzhiyun 		r = copy_to_user(buf, p->buf, n);
1398*4882a593Smuzhiyun 		n -= r;
1399*4882a593Smuzhiyun 		p->chars_left -= n;
1400*4882a593Smuzhiyun 		p->buf_index = n;
1401*4882a593Smuzhiyun 		buf += n;
1402*4882a593Smuzhiyun 		len -= n;
1403*4882a593Smuzhiyun 		ret += n;
1404*4882a593Smuzhiyun 		if (r) {
1405*4882a593Smuzhiyun 			if (!ret)
1406*4882a593Smuzhiyun 				ret = -EFAULT;
1407*4882a593Smuzhiyun 			break;
1408*4882a593Smuzhiyun 		}
1409*4882a593Smuzhiyun 	}
1410*4882a593Smuzhiyun 	p->gpa = gpa;
1411*4882a593Smuzhiyun 	if (nested)
1412*4882a593Smuzhiyun 		kvmhv_put_nested(nested);
1413*4882a593Smuzhiyun 
1414*4882a593Smuzhiyun  out:
1415*4882a593Smuzhiyun 	mutex_unlock(&p->mutex);
1416*4882a593Smuzhiyun 	return ret;
1417*4882a593Smuzhiyun }
1418*4882a593Smuzhiyun 
1419*4882a593Smuzhiyun static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1420*4882a593Smuzhiyun 			   size_t len, loff_t *ppos)
1421*4882a593Smuzhiyun {
1422*4882a593Smuzhiyun 	return -EACCES;
1423*4882a593Smuzhiyun }
1424*4882a593Smuzhiyun 
1425*4882a593Smuzhiyun static const struct file_operations debugfs_radix_fops = {
1426*4882a593Smuzhiyun 	.owner	 = THIS_MODULE,
1427*4882a593Smuzhiyun 	.open	 = debugfs_radix_open,
1428*4882a593Smuzhiyun 	.release = debugfs_radix_release,
1429*4882a593Smuzhiyun 	.read	 = debugfs_radix_read,
1430*4882a593Smuzhiyun 	.write	 = debugfs_radix_write,
1431*4882a593Smuzhiyun 	.llseek	 = generic_file_llseek,
1432*4882a593Smuzhiyun };
1433*4882a593Smuzhiyun 
1434*4882a593Smuzhiyun void kvmhv_radix_debugfs_init(struct kvm *kvm)
1435*4882a593Smuzhiyun {
1436*4882a593Smuzhiyun 	debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
1437*4882a593Smuzhiyun 			    &debugfs_radix_fops);
1438*4882a593Smuzhiyun }
1439*4882a593Smuzhiyun 
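/*
 * Create the kmem caches used for guest radix PTE and PMD table pages.
 * Each cache is sized and aligned to the table size (sizeof(void *) shifted
 * by the index size), e.g. typically 8 << 5 = 256 bytes for a PTE table
 * with a 64k base page size.
 */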
1440*4882a593Smuzhiyun int kvmppc_radix_init(void)
1441*4882a593Smuzhiyun {
1442*4882a593Smuzhiyun 	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1443*4882a593Smuzhiyun 
1444*4882a593Smuzhiyun 	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1445*4882a593Smuzhiyun 	if (!kvm_pte_cache)
1446*4882a593Smuzhiyun 		return -ENOMEM;
1447*4882a593Smuzhiyun 
1448*4882a593Smuzhiyun 	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1449*4882a593Smuzhiyun 
1450*4882a593Smuzhiyun 	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1451*4882a593Smuzhiyun 	if (!kvm_pmd_cache) {
1452*4882a593Smuzhiyun 		kmem_cache_destroy(kvm_pte_cache);
1453*4882a593Smuzhiyun 		return -ENOMEM;
1454*4882a593Smuzhiyun 	}
1455*4882a593Smuzhiyun 
1456*4882a593Smuzhiyun 	return 0;
1457*4882a593Smuzhiyun }
1458*4882a593Smuzhiyun 
1459*4882a593Smuzhiyun void kvmppc_radix_exit(void)
1460*4882a593Smuzhiyun {
1461*4882a593Smuzhiyun 	kmem_cache_destroy(kvm_pte_cache);
1462*4882a593Smuzhiyun 	kmem_cache_destroy(kvm_pmd_cache);
1463*4882a593Smuzhiyun }
1464