// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * Macros and functions to access KVM PTEs (also known as SPTEs)
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2020 Red Hat, Inc. and/or its affiliates.
 */


#include <linux/kvm_host.h>
#include "mmu.h"
#include "mmu_internal.h"
#include "x86.h"
#include "spte.h"

#include <asm/e820/api.h>

u64 __read_mostly shadow_nx_mask;
u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
u64 __read_mostly shadow_user_mask;
u64 __read_mostly shadow_accessed_mask;
u64 __read_mostly shadow_dirty_mask;
u64 __read_mostly shadow_mmio_value;
u64 __read_mostly shadow_mmio_access_mask;
u64 __read_mostly shadow_present_mask;
u64 __read_mostly shadow_me_mask;
u64 __read_mostly shadow_acc_track_mask;

u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

u8 __read_mostly shadow_phys_bits;

static u64 generation_mmio_spte_mask(u64 gen)
{
        u64 mask;

        WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
        BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);

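        /*
         * The memslot generation does not fit in a single run of spare SPTE
         * bits, so it is stored in two pieces: a low chunk and a high chunk,
         * each masked and shifted into its own bit range.
         */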
        mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
        mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
        return mask;
}

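/*
 * Descriptive note: construct an MMIO SPTE, i.e. a not-present SPTE that
 * caches the GFN, the permitted access bits and the current memslot
 * generation.  Later faults on the GFN can then be recognized as MMIO from
 * the SPTE alone, with the generation detecting entries made stale by
 * memslot changes.
 */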
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
        u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
        u64 mask = generation_mmio_spte_mask(gen);
        u64 gpa = gfn << PAGE_SHIFT;

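        /*
         * Combine the generation, the MMIO signature value, the allowed
         * access bits and the GPA.  GPA bits that overlap
         * shadow_nonpresent_or_rsvd_mask are relocated above that mask
         * (shifted left by SHADOW_NONPRESENT_OR_RSVD_MASK_LEN) so the
         * reserved bits used for the L1TF mitigation can stay set.
         */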
        access &= shadow_mmio_access_mask;
        mask |= shadow_mmio_value | access;
        mask |= gpa | shadow_nonpresent_or_rsvd_mask;
        mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
                << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;

        return mask;
}

static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
        if (pfn_valid(pfn))
                return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
                        /*
                         * Some reserved pages, such as those from NVDIMM
                         * DAX devices, are not for MMIO, and can be mapped
                         * with cached memory type for better performance.
                         * However, the above check misidentifies those pages
                         * as MMIO, and results in KVM mapping them with UC
                         * memory type, which would hurt performance.
                         * Therefore, we check the host memory type in addition
                         * and only treat UC/UC-/WC pages as MMIO.
                         */
                        (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));

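        /*
         * PFNs without a struct page (e.g. device BARs or memory not mapped
         * by the host kernel) are assumed to be MMIO unless the raw E820 map
         * says the range is RAM.
         */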
        return !e820__mapped_raw_any(pfn_to_hpa(pfn),
                                     pfn_to_hpa(pfn + 1) - 1,
                                     E820_TYPE_RAM);
}

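/*
 * Descriptive note: build a leaf SPTE mapping @gfn to @pfn with @pte_access
 * permissions.  The result is returned through @new_spte; the return value
 * is a bitmask of SET_SPTE_* flags (e.g. SET_SPTE_WRITE_PROTECTED_PT when
 * the mapping had to be made read-only to keep shadowed page tables in
 * sync).
 */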
int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
              gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
              bool can_unsync, bool host_writable, bool ad_disabled,
              u64 *new_spte)
{
        u64 spte = 0;
        int ret = 0;

        if (ad_disabled)
                spte |= SPTE_AD_DISABLED_MASK;
        else if (kvm_vcpu_ad_need_write_protect(vcpu))
                spte |= SPTE_AD_WRPROT_ONLY_MASK;

        /*
         * For the EPT case, shadow_present_mask is 0 if hardware
         * supports exec-only page table entries.  In that case,
         * ACC_USER_MASK and shadow_user_mask are used to represent
         * read access.  See FNAME(gpte_access) in paging_tmpl.h.
         */
        spte |= shadow_present_mask;
        if (!speculative)
                spte |= spte_shadow_accessed_mask(spte);

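        /*
         * When the NX huge page mitigation is enabled, huge pages are never
         * mapped executable; an instruction fetch from the region faults and
         * the mapping is then recreated with smaller, executable pages.
         */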
        if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
            is_nx_huge_page_enabled()) {
                pte_access &= ~ACC_EXEC_MASK;
        }

        if (pte_access & ACC_EXEC_MASK)
                spte |= shadow_x_mask;
        else
                spte |= shadow_nx_mask;

        if (pte_access & ACC_USER_MASK)
                spte |= shadow_user_mask;

        if (level > PG_LEVEL_4K)
                spte |= PT_PAGE_SIZE_MASK;
        if (tdp_enabled)
                spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
                        kvm_is_mmio_pfn(pfn));

        if (host_writable)
                spte |= SPTE_HOST_WRITEABLE;
        else
                pte_access &= ~ACC_WRITE_MASK;

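        /*
         * shadow_me_mask carries the memory encryption bit (e.g. the SME/SEV
         * C-bit); it applies to RAM-backed mappings only, so leave it clear
         * for MMIO pfns.
         */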
        if (!kvm_is_mmio_pfn(pfn))
                spte |= shadow_me_mask;

        spte |= (u64)pfn << PAGE_SHIFT;

        if (pte_access & ACC_WRITE_MASK) {
                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

                /*
                 * Optimization: for pte sync, if spte was writable the hash
                 * lookup is unnecessary (and expensive).  Write protection
                 * is responsibility of mmu_get_page / kvm_sync_page.
                 * Same reasoning can be applied to dirty page accounting.
                 */
                if (!can_unsync && is_writable_pte(old_spte))
                        goto out;

                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
                        pgprintk("%s: found shadow page for %llx, marking ro\n",
                                 __func__, gfn);
                        ret |= SET_SPTE_WRITE_PROTECTED_PT;
                        pte_access &= ~ACC_WRITE_MASK;
                        spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
                }
        }

        if (pte_access & ACC_WRITE_MASK)
                spte |= spte_shadow_dirty_mask(spte);

        if (speculative)
                spte = mark_spte_for_access_track(spte);

out:
        *new_spte = spte;
        return ret;
}

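/*
 * Descriptive note: build a non-leaf SPTE pointing at @child_pt.  Non-leaf
 * SPTEs are always present, writable, executable and user-accessible; the
 * effective permissions of a translation are set by the leaf entry.
 */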
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
        u64 spte;

        spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
               shadow_user_mask | shadow_x_mask | shadow_me_mask;

        if (ad_disabled)
                spte |= SPTE_AD_DISABLED_MASK;
        else
                spte |= shadow_accessed_mask;

        return spte;
}

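/*
 * Descriptive note: build the replacement SPTE for the mmu_notifier
 * change_pte path: keep the old attributes but point at @new_pfn, drop host
 * and guest write access, and mark the entry for access tracking.
 */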
u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
{
        u64 new_spte;

        new_spte = old_spte & ~PT64_BASE_ADDR_MASK;
        new_spte |= (u64)new_pfn << PAGE_SHIFT;

        new_spte &= ~PT_WRITABLE_MASK;
        new_spte &= ~SPTE_HOST_WRITEABLE;

        new_spte = mark_spte_for_access_track(new_spte);

        return new_spte;
}

static u8 kvm_get_shadow_phys_bits(void)
{
        /*
         * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
         * in CPU detection code, but the processor treats those reduced bits as
         * 'keyID' thus they are not reserved bits.  Therefore KVM needs to look at
         * the physical address bits reported by CPUID.
         */
        if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
                return cpuid_eax(0x80000008) & 0xff;

        /*
         * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
         * custom CPUID.  Proceed with whatever the kernel found since these features
         * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
         */
        return boot_cpu_data.x86_phys_bits;
}

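/*
 * Descriptive note: turn @spte into an access-tracked SPTE.  With A/D bits
 * enabled this only clears the accessed bit.  Without A/D bits, the R/X
 * permissions are stashed in the spare "saved bits" area and then cleared,
 * so the next access faults and lets KVM record the access and restore the
 * permissions.
 */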
u64 mark_spte_for_access_track(u64 spte)
{
        if (spte_ad_enabled(spte))
                return spte & ~shadow_accessed_mask;

        if (is_access_track_spte(spte))
                return spte;

        /*
         * Making an Access Tracking PTE will result in removal of write access
         * from the PTE.  So, verify that we will be able to restore the write
         * access in the fast page fault path later on.
         */
        WARN_ONCE((spte & PT_WRITABLE_MASK) &&
                  !spte_can_locklessly_be_made_writable(spte),
                  "kvm: Writable SPTE is not locklessly dirty-trackable\n");

        WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
                          SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
                  "kvm: Access Tracking saved bit locations are not zero\n");

        spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
                SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
        spte &= ~shadow_acc_track_mask;

        return spte;
}

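/*
 * Descriptive note: set the value and access mask used for MMIO SPTEs.
 * @mmio_value must not collide with the bits used for the GFN relocation
 * done by make_mmio_spte(), which the WARN_ONs below check.
 */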
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
{
        BUG_ON((u64)(unsigned)access_mask != access_mask);
        WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
        WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
        shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
        shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

/*
 * Sets the shadow PTE masks used by the MMU.
 *
 * Assumptions:
 *  - Setting either @accessed_mask or @dirty_mask requires setting both
 *  - At least one of @accessed_mask or @acc_track_mask must be set
 */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
                u64 acc_track_mask, u64 me_mask)
{
        BUG_ON(!dirty_mask != !accessed_mask);
        BUG_ON(!accessed_mask && !acc_track_mask);
        BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);

        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
        shadow_present_mask = p_mask;
        shadow_acc_track_mask = acc_track_mask;
        shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

void kvm_mmu_reset_all_pte_masks(void)
{
        u8 low_phys_bits;

        shadow_user_mask = 0;
        shadow_accessed_mask = 0;
        shadow_dirty_mask = 0;
        shadow_nx_mask = 0;
        shadow_x_mask = 0;
        shadow_present_mask = 0;
        shadow_acc_track_mask = 0;

        shadow_phys_bits = kvm_get_shadow_phys_bits();

        /*
         * If the CPU has 46 or less physical address bits, then set an
         * appropriate mask to guard against L1TF attacks.  Otherwise, it is
         * assumed that the CPU is not vulnerable to L1TF.
         *
         * Some Intel CPUs address the L1 cache using more PA bits than are
         * reported by CPUID.  Use the PA width of the L1 cache when possible
         * to achieve more effective mitigation, e.g. if system RAM overlaps
         * the most significant bits of legal physical address space.
         */
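        /*
         * Illustrative example (assuming SHADOW_NONPRESENT_OR_RSVD_MASK_LEN
         * is 5): with x86_cache_bits == 46, low_phys_bits becomes 41 and
         * shadow_nonpresent_or_rsvd_mask covers physical address bits 41-45,
         * i.e. the top of the cache-addressable range, which is set in every
         * not-present SPTE to defeat L1TF speculation.
         */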
        shadow_nonpresent_or_rsvd_mask = 0;
        low_phys_bits = boot_cpu_data.x86_phys_bits;
        if (boot_cpu_has_bug(X86_BUG_L1TF) &&
            !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
                          52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
                low_phys_bits = boot_cpu_data.x86_cache_bits
                        - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
                shadow_nonpresent_or_rsvd_mask =
                        rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
        }

        shadow_nonpresent_or_rsvd_lower_gfn_mask =
                GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}