xref: /OK3568_Linux_fs/kernel/arch/s390/mm/pgalloc.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
// SPDX-License-Identifier: GPL-2.0
/*
 *  Page table allocation functions
 *
 *    Copyright IBM Corp. 2016
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_PGSTE

int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
	{
		.procname	= "allocate_pgste",
		.data		= &page_table_allocate_pgste,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO | S_IWUSR,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};

static struct ctl_table page_table_sysctl_dir[] = {
	{
		.procname	= "vm",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= page_table_sysctl,
	},
	{ }
};

static int __init page_table_register_sysctl(void)
{
	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#endif /* CONFIG_PGSTE */
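
/*
 * The sysctl above shows up as /proc/sys/vm/allocate_pgste. When it is set
 * to 1, newly created address spaces allocate full 4K page tables with
 * PGSTEs attached instead of 2K fragments, which is needed to run KVM
 * guests. A minimal usage sketch (plain sysctl semantics, nothing here is
 * s390 specific beyond the knob's name):
 *
 *	sysctl vm.allocate_pgste=1
 *	# or equivalently: echo 1 > /proc/sys/vm/allocate_pgste
 */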

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (!page)
		return NULL;
	arch_set_page_dat(page, 2);
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, 2);
}
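
/*
 * A CRST (region or segment) table is an order-2 allocation: four pages,
 * i.e. 16K holding 2048 eight-byte entries. A minimal sketch of how the
 * pair above is used by the callers in this file, e.g. for the new p4d
 * table in crst_table_upgrade() below:
 *
 *	unsigned long *table = crst_table_alloc(mm);
 *
 *	if (!table)
 *		return -ENOMEM;
 *	crst_table_init(table, _REGION2_ENTRY_EMPTY);
 *	...
 *	crst_table_free(mm, table);
 */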

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	/* we must change all active ASCEs to avoid the creation of new TLBs */
	if (current->active_mm == mm) {
		S390_lowcore.user_asce = mm->context.asce;
		if (current->thread.mm_segment == USER_DS) {
			__ctl_load(S390_lowcore.user_asce, 1, 1);
			/* Mark user-ASCE present in CR1 */
			clear_cpu_flag(CIF_ASCE_PRIMARY);
		}
		if (current->thread.mm_segment == USER_DS_SACF) {
			__ctl_load(S390_lowcore.user_asce, 7, 7);
			/* enable_sacf_uaccess does all or nothing */
			WARN_ON(!test_cpu_flag(CIF_ASCE_SECONDARY));
		}
	}
	__tlb_flush_local();
}
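
/*
 * The helper above runs on every CPU (see the on_each_cpu() call below) so
 * that no CPU keeps translating through the old, smaller ASCE: the user ASCE
 * lives in control register 1 (primary space) in the normal case and in
 * control register 7 (secondary space) while a sacf-based uaccess is in
 * progress, hence the two __ctl_load() variants.
 */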

int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
	unsigned long asce_limit = mm->context.asce_limit;

	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(asce_limit < _REGION2_SIZE);

	if (end <= asce_limit)
		return 0;

	if (asce_limit == _REGION2_SIZE) {
		p4d = crst_table_alloc(mm);
		if (unlikely(!p4d))
			goto err_p4d;
		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
	}
	if (end > _REGION1_SIZE) {
		pgd = crst_table_alloc(mm);
		if (unlikely(!pgd))
			goto err_pgd;
		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
	}

	spin_lock_bh(&mm->page_table_lock);

	/*
	 * This routine gets called with mmap_lock held and there is no
	 * reason to optimize for the case where it is not. Should that
	 * ever change, the check below will let us know.
	 */
	VM_BUG_ON(asce_limit != mm->context.asce_limit);

	if (p4d) {
		__pgd = (unsigned long *) mm->pgd;
		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
		mm->pgd = (pgd_t *) p4d;
		mm->context.asce_limit = _REGION1_SIZE;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		mm_inc_nr_puds(mm);
	}
	if (pgd) {
		__pgd = (unsigned long *) mm->pgd;
		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
		mm->pgd = (pgd_t *) pgd;
		mm->context.asce_limit = TASK_SIZE_MAX;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
	}

	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);

	return 0;

err_pgd:
	crst_table_free(mm, p4d);
err_p4d:
	return -ENOMEM;
}
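
/*
 * Rough sketch of the intended caller, the s390 get_unmapped_area path
 * (names and details are from memory and only illustrative): when a mapping
 * would end above the current ASCE limit, the page table tree is grown
 * before the mapping is established, e.g.
 *
 *	if (addr + len > mm->context.asce_limit &&
 *	    addr + len <= TASK_SIZE) {
 *		rc = crst_table_upgrade(mm, addr + len);
 *		if (rc)
 *			return (unsigned long) rc;
 *	}
 */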

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}
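
/*
 * atomic_xor_bits() is used below to toggle allocation state bits kept in
 * the upper byte of page->_refcount (bits 24..31) without touching the
 * normal reference count in the low bits. For example, marking the second
 * 2K half of a page table page as allocated is
 * atomic_xor_bits(&page->_refcount, 1U << (1 + 24)); the return value is
 * the new word, so callers can inspect the remaining bits after the update.
 */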

#ifdef CONFIG_PGSTE

struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	u64 *table;

	page = alloc_page(GFP_KERNEL);
	if (page) {
		table = (u64 *)page_to_phys(page);
		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	}
	return page;
}
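
/*
 * Layout of a PGSTE page table page: the first 2K (PTRS_PER_PTE entries of
 * eight bytes) hold the page table entries, initialized to _PAGE_INVALID,
 * and the second 2K hold the corresponding PGSTEs (per-pte guest storage
 * key and KVM state), initialized to zero. Such a page is never split into
 * 2K fragments.
 */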

void page_table_free_pgste(struct page *page)
{
	__free_page(page);
}

#endif /* CONFIG_PGSTE */

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct page *page;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			page = list_first_entry(&mm->context.pgtable_list,
						struct page, lru);
			mask = atomic_read(&page->_refcount) >> 24;
			mask = (mask | (mask >> 4)) & 3;
			if (mask != 3) {
				table = (unsigned long *) page_to_phys(page);
				bit = mask & 1;		/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				atomic_xor_bits(&page->_refcount,
							1U << (bit + 24));
				list_del(&page->lru);
			}
		}
		spin_unlock_bh(&mm->context.lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return NULL;
	if (!pgtable_pte_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	arch_set_page_dat(page, 0);
	/* Initialize page table */
	table = (unsigned long *) page_to_phys(page);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		atomic_xor_bits(&page->_refcount, 3 << 24);
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_xor_bits(&page->_refcount, 1 << 24);
		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
		spin_lock_bh(&mm->context.lock);
		list_add(&page->lru, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.lock);
	}
	return table;
}
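
/*
 * Worked example of the _refcount encoding used above (upper byte only,
 * after the ">> 24" shift): bit 0 = first 2K half allocated, bit 1 = second
 * 2K half allocated, bits 4 and 5 = the matching half is queued for RCU
 * removal (see page_table_free_rcu() and __tlb_remove_table() below).
 * "mask = (mask | (mask >> 4)) & 3" therefore treats a half that is either
 * allocated or still pending free as busy; only a page with mask != 3 has a
 * 2K fragment that can be handed out again.
 */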

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (!mm_alloc_pgste(mm)) {
		/* Free 2K page table fragment of a 4K page */
		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.lock);
		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
		mask >>= 24;
		if (mask & 3)
			list_add(&page->lru, &mm->context.pgtable_list);
		else
			list_del(&page->lru);
		spin_unlock_bh(&mm->context.lock);
		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
		mask >>= 24;
		if (mask != 0)
			return;
	} else {
		atomic_xor_bits(&page->_refcount, 3U << 24);
	}

	pgtable_pte_page_dtor(page);
	__free_page(page);
}
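
/*
 * In the 2K-fragment path above the state bits are flipped in two steps:
 * the first xor with 0x11 << (bit + 24) clears the "allocated" bit and
 * temporarily sets the "pending" bit for that half, which keeps a concurrent
 * page_table_alloc() from reusing it while the pgtable_list is updated; the
 * second xor with 0x10 << (bit + 24) drops the pending bit again, and only
 * if no state bits remain set is the whole page really freed.
 */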

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | 3);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.lock);
	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
	mask >>= 24;
	if (mask & 3)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	else
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.lock);
	table = (unsigned long *) (__pa(table) | (1U << bit));
	tlb_remove_table(tlb, table);
}
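
/*
 * The pointer handed to tlb_remove_table() above carries a small tag in its
 * low bits, which __tlb_remove_table() below decodes after the grace period:
 * 0 means a CRST table, 1 or 2 select the lower or higher 2K half of a
 * fragmented page table page, and 3 marks a full 4K page table with PGSTEs.
 * This works because all of these tables are at least 2K aligned, so the low
 * two address bits are known to be zero.
 */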

void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 3;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);

	switch (mask) {
	case 0:		/* pmd, pud, or p4d */
		free_pages((unsigned long) table, 2);
		break;
	case 1:		/* lower 2K of a 4K page table */
	case 2:		/* higher 2K of a 4K page table */
		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
		mask >>= 24;
		if (mask != 0)
			break;
		fallthrough;
	case 3:		/* 4K page table with pgstes */
		if (mask & 3)
			atomic_xor_bits(&page->_refcount, 3 << 24);
		pgtable_pte_page_dtor(page);
		__free_page(page);
		break;
	}
}

/*
 * Base infrastructure required to generate basic asces, region, segment,
 * and page tables that do not make use of enhanced features like EDAT1.
 */

static struct kmem_cache *base_pgt_cache;

static unsigned long base_pgt_alloc(void)
{
	u64 *table;

	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
	if (table)
		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
	return (unsigned long) table;
}

static void base_pgt_free(unsigned long table)
{
	kmem_cache_free(base_pgt_cache, (void *) table);
}

static unsigned long base_crst_alloc(unsigned long val)
{
	unsigned long table;

	table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
	if (table)
		crst_table_init((unsigned long *)table, val);
	return table;
}

static void base_crst_free(unsigned long table)
{
	free_pages(table, CRST_ALLOC_ORDER);
}

#define BASE_ADDR_END_FUNC(NAME, SIZE)					\
static inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
						   unsigned long end)	\
{									\
	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
									\
	return (next - 1) < (end - 1) ? next : end;			\
}

BASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
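
/*
 * BASE_ADDR_END_FUNC() generates base_page_addr_end(), base_segment_addr_end()
 * and so on: each one rounds addr up to the next boundary of the given SIZE
 * and clamps the result to end; the "- 1" comparison keeps the result
 * correct when the rounded-up value wraps to 0 at the top of the address
 * space. For example, with a 1M segment size,
 * base_segment_addr_end(0x180000, 0x500000) returns 0x200000, while
 * base_segment_addr_end(0x180000, 0x1c0000) returns 0x1c0000.
 */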

static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		"	lra	%0,0(%1)\n"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}

static int base_page_walk(unsigned long origin, unsigned long addr,
			  unsigned long end, int alloc)
{
	unsigned long *pte, next;

	if (!alloc)
		return 0;
	pte = (unsigned long *) origin;
	pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
	do {
		next = base_page_addr_end(addr, end);
		*pte = base_lra(addr);
	} while (pte++, addr = next, addr < end);
	return 0;
}
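
/*
 * base_lra() uses the LOAD REAL ADDRESS instruction to translate a kernel
 * virtual address through the currently attached DAT tables, and
 * base_page_walk() copies the resulting real address of every page into the
 * new page table. The generated asce thus maps the requested kernel range to
 * the same frames as the regular kernel mapping, only without any EDAT large
 * pages. There is nothing to free at this level, hence the early return for
 * the !alloc case.
 */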

static int base_segment_walk(unsigned long origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *ste, next, table;
	int rc;

	ste = (unsigned long *) origin;
	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
	do {
		next = base_segment_addr_end(addr, end);
		if (*ste & _SEGMENT_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_pgt_alloc();
			if (!table)
				return -ENOMEM;
			*ste = table | _SEGMENT_ENTRY;
		}
		table = *ste & _SEGMENT_ENTRY_ORIGIN;
		rc = base_page_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_pgt_free(table);
		cond_resched();
	} while (ste++, addr = next, addr < end);
	return 0;
}

static int base_region3_walk(unsigned long origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rtte, next, table;
	int rc;

	rtte = (unsigned long *) origin;
	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
	do {
		next = base_region3_addr_end(addr, end);
		if (*rtte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rtte = table | _REGION3_ENTRY;
		}
		table = *rtte & _REGION_ENTRY_ORIGIN;
		rc = base_segment_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rtte++, addr = next, addr < end);
	return 0;
}

static int base_region2_walk(unsigned long origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rste, next, table;
	int rc;

	rste = (unsigned long *) origin;
	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
	do {
		next = base_region2_addr_end(addr, end);
		if (*rste & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rste = table | _REGION2_ENTRY;
		}
		table = *rste & _REGION_ENTRY_ORIGIN;
		rc = base_region3_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rste++, addr = next, addr < end);
	return 0;
}

static int base_region1_walk(unsigned long origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rfte, next, table;
	int rc;

	rfte = (unsigned long *) origin;
	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
	do {
		next = base_region1_addr_end(addr, end);
		if (*rfte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rfte = table | _REGION1_ENTRY;
		}
		table = *rfte & _REGION_ENTRY_ORIGIN;
		rc = base_region2_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rfte++, addr = next, addr < end);
	return 0;
}
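
/*
 * The walkers above are used in two modes: with alloc=1 they build the
 * table hierarchy for a given range, allocating lower level tables on
 * demand; with alloc=0 they revisit an existing hierarchy and free every
 * lower level table they find, which is how base_asce_free() below tears
 * everything down with the same code.
 */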

/**
 * base_asce_free - free asce and tables returned from base_asce_alloc()
 * @asce: asce to be freed
 *
 * Frees all region, segment, and page tables that were allocated with a
 * corresponding base_asce_alloc() call.
 */
void base_asce_free(unsigned long asce)
{
	unsigned long table = asce & _ASCE_ORIGIN;

	if (!asce)
		return;
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_SEGMENT:
		base_segment_walk(table, 0, _REGION3_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION3:
		base_region3_walk(table, 0, _REGION2_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION2:
		base_region2_walk(table, 0, _REGION1_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION1:
		base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
		break;
	}
	base_crst_free(table);
}

static int base_pgt_cache_init(void)
{
	static DEFINE_MUTEX(base_pgt_cache_mutex);
	unsigned long sz = _PAGE_TABLE_SIZE;

	if (base_pgt_cache)
		return 0;
	mutex_lock(&base_pgt_cache_mutex);
	if (!base_pgt_cache)
		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
	mutex_unlock(&base_pgt_cache_mutex);
	return base_pgt_cache ? 0 : -ENOMEM;
}

/**
 * base_asce_alloc - create kernel mapping without enhanced DAT features
 * @addr: virtual start address of kernel mapping
 * @num_pages: number of consecutive pages
 *
 * Generate an asce, including all required region, segment and page tables,
 * that can be used to access the virtual kernel mapping. Unlike the regular
 * kernel mapping, the returned asce does not make use of any enhanced DAT
 * features such as large pages. This is required for some I/O functions
 * that pass an asce, e.g. some service call requests.
 *
 * Note: the returned asce may NEVER be attached to any CPU. It may only be
 *	 used for I/O requests. TLB entries that would result from attaching
 *	 the asce to a CPU are never cleared.
 */
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
	unsigned long asce, table, end;
	int rc;

	if (base_pgt_cache_init())
		return 0;
	end = addr + num_pages * PAGE_SIZE;
	if (end <= _REGION3_SIZE) {
		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_segment_walk(table, addr, end, 1);
		asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION2_SIZE) {
		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region3_walk(table, addr, end, 1);
		asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION1_SIZE) {
		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region2_walk(table, addr, end, 1);
		asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
	} else {
		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region1_walk(table, addr, end, 1);
		asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
	}
	if (rc) {
		base_asce_free(asce);
		asce = 0;
	}
	return asce;
}
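
/*
 * Illustrative usage of the pair above (the concrete caller and buffer are
 * only an example): code that must hand an asce describing a kernel buffer
 * to the machine would do roughly
 *
 *	unsigned long asce;
 *
 *	asce = base_asce_alloc((unsigned long) buf, num_pages);
 *	if (!asce)
 *		return -ENOMEM;
 *	... pass asce with the service call / I/O request ...
 *	base_asce_free(asce);
 */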