// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * This file contains common routines for dealing with freeing of page
 * tables, along with common page table handling code.
 *
 * Derived from arch/powerpc/mm/tlb_64.c:
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 */

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>

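/* Return true if the fault currently being handled was an instruction
 * fetch, i.e. trap 0x400 (the instruction storage interrupt).
 */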
static inline int is_exec_fault(void)
{
        return current->thread.regs && TRAP(current->thread.regs) == 0x400;
}

/* We only try to do i/d cache coherency on stuff that looks like
 * reasonably "normal" PTEs. We currently require a PTE to be present,
 * and we avoid _PAGE_SPECIAL and cache-inhibited PTEs. We also only do
 * that on userspace PTEs.
 */
static inline int pte_looks_normal(pte_t pte)
{
        if (pte_present(pte) && !pte_special(pte)) {
                if (pte_ci(pte))
                        return 0;
                if (pte_user(pte))
                        return 1;
        }
        return 0;
}

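/* Return the struct page backing a PTE, or NULL if the pfn isn't valid
 * or the page is reserved and should be left alone.
 */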
static struct page *maybe_pte_to_page(pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);
        struct page *page;

        if (unlikely(!pfn_valid(pfn)))
                return NULL;
        page = pfn_to_page(pfn);
        if (PageReserved(page))
                return NULL;
        return page;
}

#ifdef CONFIG_PPC_BOOK3S

/* Server-style MMUs handle coherency when hashing if HW exec permission
 * is supported per page (currently 64-bit only). If not, then we always
 * flush the cache for valid PTEs in set_pte. Embedded CPUs without HW exec
 * support fall into the same category.
 */

static pte_t set_pte_filter_hash(pte_t pte)
{
        if (radix_enabled())
                return pte;

        pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
        if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
                                       cpu_has_feature(CPU_FTR_NOEXECUTE))) {
                struct page *pg = maybe_pte_to_page(pte);

                if (!pg)
                        return pte;
                if (!test_bit(PG_arch_1, &pg->flags)) {
                        flush_dcache_icache_page(pg);
                        set_bit(PG_arch_1, &pg->flags);
                }
        }
        return pte;
}

#else /* CONFIG_PPC_BOOK3S */

static pte_t set_pte_filter_hash(pte_t pte) { return pte; }

#endif /* CONFIG_PPC_BOOK3S */

/* Embedded-type MMU with HW exec support. This is a bit more complicated
 * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC, so
 * instead we "filter out" the exec permission for non-clean pages.
 */
static inline pte_t set_pte_filter(pte_t pte)
{
        struct page *pg;

        if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
                return set_pte_filter_hash(pte);

        /* No exec permission in the first place, move on */
        if (!pte_exec(pte) || !pte_looks_normal(pte))
                return pte;

        /* If you set _PAGE_EXEC on weird pages you're on your own */
        pg = maybe_pte_to_page(pte);
        if (unlikely(!pg))
                return pte;

        /* If the page is clean, we move on */
        if (test_bit(PG_arch_1, &pg->flags))
                return pte;

        /* If it's an exec fault, we flush the cache and make it clean */
        if (is_exec_fault()) {
                flush_dcache_icache_page(pg);
                set_bit(PG_arch_1, &pg->flags);
                return pte;
        }

        /* Else, we filter out _PAGE_EXEC */
        return pte_exprotect(pte);
}

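/* Counterpart of set_pte_filter(): on an exec fault for a PTE that had
 * its exec permission filtered out, make the page I$/D$ coherent if it
 * isn't already clean and hand back an executable PTE.
 */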
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
                                     int dirty)
{
        struct page *pg;

        if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
                return pte;

        /* So here, we only care about exec faults, as we use them
         * to recover lost _PAGE_EXEC and perform I$/D$ coherency
         * if necessary. Also if _PAGE_EXEC is already set, same deal,
         * we just bail out.
         */
        if (dirty || pte_exec(pte) || !is_exec_fault())
                return pte;

#ifdef CONFIG_DEBUG_VM
        /* So this is an exec fault, _PAGE_EXEC is not set. If it was
         * an error we would have bailed out earlier in do_page_fault()
         * but let's make sure of it.
         */
        if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
                return pte;
#endif /* CONFIG_DEBUG_VM */

        /* If you set _PAGE_EXEC on weird pages you're on your own */
        pg = maybe_pte_to_page(pte);
        if (unlikely(!pg))
                goto bail;

        /* If the page is already clean, we move on */
        if (test_bit(PG_arch_1, &pg->flags))
                goto bail;

        /* Clean the page and set PG_arch_1 */
        flush_dcache_icache_page(pg);
        set_bit(PG_arch_1, &pg->flags);

bail:
        return pte_mkexec(pte);
}

/*
 * set_pte_at() stores a Linux PTE into the Linux page table.
 */
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
                pte_t pte)
{
        /*
         * Make sure the hardware valid bit is not set. We don't do
         * a tlb flush for this update.
         */
        VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

        /* Note: mm->context.id might not yet have been assigned as
         * this context might not have been activated yet when this
         * is called.
         */
        pte = set_pte_filter(pte);

        /* Perform the setting of the PTE */
        __set_pte_at(mm, addr, ptep, pte, 0);
}

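/* Remove the kernel mapping of a single page: clear the PTE in init_mm
 * and flush the TLB for that page.
 */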
void unmap_kernel_page(unsigned long va)
{
        pmd_t *pmdp = pmd_off_k(va);
        pte_t *ptep = pte_offset_kernel(pmdp, va);

        pte_clear(&init_mm, va, ptep);
        flush_tlb_kernel_range(va, va + PAGE_SIZE);
}

/*
 * This is called when relaxing access to a PTE. It's also called in the page
 * fault path when we don't hit any of the major fault cases, i.e. a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
 * handled those two for us; we additionally deal with missing execute
 * permission here on some processors.
 */
int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pte_t *ptep, pte_t entry, int dirty)
{
        int changed;

        entry = set_access_flags_filter(entry, vma, dirty);
        changed = !pte_same(*(ptep), entry);
        if (changed) {
                assert_pte_locked(vma->vm_mm, address);
                __ptep_set_access_flags(vma, ptep, entry,
                                        address, mmu_virtual_psize);
        }
        return changed;
}

#ifdef CONFIG_HUGETLB_PAGE
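/* Hugetlb counterpart of ptep_set_access_flags(): update the access
 * flags of a huge PTE, flushing with the huge page size where the
 * platform needs it.
 */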
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
                               unsigned long addr, pte_t *ptep,
                               pte_t pte, int dirty)
{
#ifdef HUGETLB_NEED_PRELOAD
        /*
         * The "return 1" forces a call of update_mmu_cache, which will write a
         * TLB entry. Without this, platforms that don't do a write of the TLB
         * entry in the TLB miss handler asm will fault ad infinitum.
         */
        ptep_set_access_flags(vma, addr, ptep, pte, dirty);
        return 1;
#else
        int changed, psize;

        pte = set_access_flags_filter(pte, vma, dirty);
        changed = !pte_same(*(ptep), pte);
        if (changed) {

#ifdef CONFIG_PPC_BOOK3S_64
                struct hstate *h = hstate_vma(vma);

                psize = hstate_get_psize(h);
#ifdef CONFIG_DEBUG_VM
                assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
#endif

#else
                /*
                 * Not used on non-book3s64 platforms.
                 * 8xx compares it with mmu_virtual_psize to
                 * know if it is a huge page or not.
                 */
                psize = MMU_PAGE_COUNT;
#endif
                __ptep_set_access_flags(vma, ptep, pte, addr, psize);
        }
        return changed;
#endif
}

#if defined(CONFIG_PPC_8xx)
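/* On 8xx, a huge page is covered by several consecutive PTE cells in the
 * page table, so replicate the PTE value into each 4k cell, advancing
 * the physical address as we go.
 */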
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
        pmd_t *pmd = pmd_off(mm, addr);
        pte_basic_t val;
        pte_basic_t *entry = &ptep->pte;
        int num, i;

        /*
         * Make sure the hardware valid bit is not set. We don't do
         * a tlb flush for this update.
         */
        VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

        pte = set_pte_filter(pte);

        val = pte_val(pte);

        num = number_of_cells_per_pte(pmd, val, 1);

        for (i = 0; i < num; i++, entry++, val += SZ_4K)
                *entry = val;
}
#endif
#endif /* CONFIG_HUGETLB_PAGE */

#ifdef CONFIG_DEBUG_VM
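/* Debug check that the page table lock protecting the PTE for @addr in
 * @mm is actually held (only built with CONFIG_DEBUG_VM).
 */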
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        if (mm == &init_mm)
                return;
        pgd = mm->pgd + pgd_index(addr);
        BUG_ON(pgd_none(*pgd));
        p4d = p4d_offset(pgd, addr);
        BUG_ON(p4d_none(*p4d));
        pud = pud_offset(p4d, addr);
        BUG_ON(pud_none(*pud));
        pmd = pmd_offset(pud, addr);
        /*
         * To collapse normal pages into a hugepage, khugepaged first sets
         * the pmd to none to force page fault/gup to take mmap_lock. After
         * the pmd is set to none, it does a pte_clear which ends up calling
         * this assertion, so if we find the pmd none, just return.
         */
        if (pmd_none(*pmd))
                return;
        BUG_ON(!pmd_present(*pmd));
        assert_spin_locked(pte_lockptr(mm, pmd));
}
#endif /* CONFIG_DEBUG_VM */

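/* Translate an address in the vmalloc area to its physical address. */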
unsigned long vmalloc_to_phys(void *va)
{
        unsigned long pfn = vmalloc_to_pfn(va);

        BUG_ON(!pfn);
        return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
}
EXPORT_SYMBOL_GPL(vmalloc_to_phys);

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it. This function needs to be called with interrupts disabled. We use
 * this variant when we have MSR[EE] = 0 but the paca->irq_soft_mask is
 * IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
                        bool *is_thp, unsigned *hpage_shift)
{
        pgd_t *pgdp;
        p4d_t p4d, *p4dp;
        pud_t pud, *pudp;
        pmd_t pmd, *pmdp;
        pte_t *ret_pte;
        hugepd_t *hpdp = NULL;
        unsigned pdshift;

        if (hpage_shift)
                *hpage_shift = 0;

        if (is_thp)
                *is_thp = false;

        /*
         * Always operate on the local stack value. This makes sure the
         * value doesn't get updated by a parallel THP split/collapse,
         * page fault or page unmap. The returned pte_t * is still not
         * stable, so the caller must re-check it for the above conditions.
         * The top level is an exception because it is folded into p4d.
         */
        pgdp = pgdir + pgd_index(ea);
        p4dp = p4d_offset(pgdp, ea);
        p4d  = READ_ONCE(*p4dp);
        pdshift = P4D_SHIFT;

        if (p4d_none(p4d))
                return NULL;

        if (p4d_is_leaf(p4d)) {
                ret_pte = (pte_t *)p4dp;
                goto out;
        }

        if (is_hugepd(__hugepd(p4d_val(p4d)))) {
                hpdp = (hugepd_t *)&p4d;
                goto out_huge;
        }

        /*
         * Even if we end up with an unmap, the pgtable will not
         * be freed, because we do an RCU free and we have interrupts
         * disabled here.
         */
        pdshift = PUD_SHIFT;
        pudp = pud_offset(&p4d, ea);
        pud  = READ_ONCE(*pudp);

        if (pud_none(pud))
                return NULL;

        if (pud_is_leaf(pud)) {
                ret_pte = (pte_t *)pudp;
                goto out;
        }

        if (is_hugepd(__hugepd(pud_val(pud)))) {
                hpdp = (hugepd_t *)&pud;
                goto out_huge;
        }

        pdshift = PMD_SHIFT;
        pmdp = pmd_offset(&pud, ea);
        pmd  = READ_ONCE(*pmdp);

        /*
         * A hugepage collapse is captured by this condition, see
         * pmdp_collapse_flush.
         */
        if (pmd_none(pmd))
                return NULL;

#ifdef CONFIG_PPC_BOOK3S_64
        /*
         * A hugepage split is captured by this condition, see
         * pmdp_invalidate.
         *
         * Huge page modification can be caught here too.
         */
        if (pmd_is_serializing(pmd))
                return NULL;
#endif

        if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
                if (is_thp)
                        *is_thp = true;
                ret_pte = (pte_t *)pmdp;
                goto out;
        }

        if (pmd_is_leaf(pmd)) {
                ret_pte = (pte_t *)pmdp;
                goto out;
        }

        if (is_hugepd(__hugepd(pmd_val(pmd)))) {
                hpdp = (hugepd_t *)&pmd;
                goto out_huge;
        }

        return pte_offset_kernel(&pmd, ea);

out_huge:
        if (!hpdp)
                return NULL;

        ret_pte = hugepte_offset(*hpdp, ea, pdshift);
        pdshift = hugepd_shift(*hpdp);
out:
        if (hpage_shift)
                *hpage_shift = pdshift;
        return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);