/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>
#include <asm/export.h>
#include <asm/asm-compat.h>

/*
 * Load state from memory into VMX registers including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(load_vr_state)
	li	r4,VRSTATE_VSCR
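	/*
	 * lvx/stvx have no displacement form, so the VSCR word is addressed
	 * as (r4|0) + r3, i.e. VRSTATE_VSCR bytes into the thread_vr_state
	 * buffer that r3 points at.
	 */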
	lvx	v0,r4,r3
	mtvscr	v0
	REST_32VRS(0,r4,r3)
	blr
EXPORT_SYMBOL(load_vr_state)
_ASM_NOKPROBE_SYMBOL(load_vr_state); /* used by restore_math */

/*
 * Store VMX state into memory, including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(store_vr_state)
	SAVE_32VRS(0, r4, r3)
	mfvscr	v0
	li	r4, VRSTATE_VSCR
	stvx	v0, r4, r3
	blr
EXPORT_SYMBOL(store_vr_state)
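
/*
 * Both entry points take a pointer to a struct thread_vr_state in r3 and
 * follow the normal C calling convention.  A minimal C-side sketch (MSR
 * handling elided; callers such as restore_math are expected to have
 * MSR_VEC on already):
 *
 *	struct thread_vr_state *v = &tsk->thread.vr_state;
 *
 *	store_vr_state(v);	// dump v0-v31 and VSCR to *v
 *	load_vr_state(v);	// reload them later
 */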

/*
 * Disable VMX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Enables VMX for use in the kernel on return.
 * On SMP we know the VMX is free, since we give it up every
 * switch (i.e., no lazy save of the vector registers).
 *
 * Note that on 32-bit this can only use registers that will be
 * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
 */
_GLOBAL(load_up_altivec)
	mfmsr	r5			/* grab the current MSR */
	oris	r5,r5,MSR_VEC@h
	MTMSRD(r5)			/* enable use of AltiVec now */
	isync
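	/*
	 * MTMSRD expands to mtmsrd on 64-bit and mtmsr on 32-bit; the isync
	 * makes the MSR_VEC write take effect before the VMX instructions
	 * below execute.
	 */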

	/*
	 * While userspace in general ignores VRSAVE, glibc uses it as a
	 * boolean to optimise userspace context save/restore. Whenever we
	 * take an AltiVec unavailable exception we must set VRSAVE to
	 * something non-zero. Set it to all 1s. See also the programming
	 * note in the ISA.
	 */
	mfspr	r4,SPRN_VRSAVE
	cmpwi	0,r4,0
	bne+	1f
	li	r4,-1
	mtspr	SPRN_VRSAVE,r4
1:
	/* enable use of VMX after return */
#ifdef CONFIG_PPC32
	mfspr	r5,SPRN_SPRG_THREAD		/* current task's THREAD (phys) */
	oris	r9,r9,MSR_VEC@h		/* set MSR_VEC in the saved MSR image */
#ifdef CONFIG_VMAP_STACK
	tovirt(r5, r5)
#endif
#else
	ld	r4,PACACURRENT(r13)
	addi	r5,r4,THREAD		/* Get THREAD */
	oris	r12,r12,MSR_VEC@h	/* r12 holds the interrupted MSR image */
	std	r12,_MSR(r1)		/* restored with MSR_VEC set on return */
#endif
	li	r4,1
	stb	r4,THREAD_LOAD_VEC(r5)
	addi	r6,r5,THREAD_VRSTATE
	li	r4,1
	li	r10,VRSTATE_VSCR
	stw	r4,THREAD_USED_VR(r5)
	lvx	v0,r10,r6
	mtvscr	v0
	REST_32VRS(0,r4,r6)
	/* restore registers and return */
	blr
_ASM_NOKPROBE_SYMBOL(load_up_altivec)

/*
 * save_altivec(tsk)
 * Save the task's vector registers to its thread_struct.
 */
_GLOBAL(save_altivec)
	addi	r3,r3,THREAD		/* want THREAD of task */
	PPC_LL	r7,THREAD_VRSAVEAREA(r3)	/* alternate save area, if set */
	PPC_LL	r5,PT_REGS(r3)
	PPC_LCMPI	0,r7,0
	bne	2f
	addi	r7,r3,THREAD_VRSTATE	/* default: thread.vr_state */
2:	SAVE_32VRS(0,r4,r7)
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r7
	blr
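
/*
 * A hedged sketch of the expected C-side caller: the generic save path
 * passes the task pointer in r3, and only when the user MSR shows live
 * VMX state (names here are illustrative):
 *
 *	if (usermsr & MSR_VEC)
 *		save_altivec(tsk);
 */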

#ifdef CONFIG_VSX

#ifdef CONFIG_PPC32
#error This asm code isn't ready for 32-bit kernels
#endif

/*
 * load_up_vsx(unused, unused, tsk)
 * Enable VSX for the current task, loading its FP and VMX register
 * state first.  Reuses the load_up_fpu and load_up_altivec paths, but
 * first checks whether that state has already been loaded.
 */
_GLOBAL(load_up_vsx)
/* Load the FP and VMX register state if that hasn't been done yet */
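	/*
	 * cr0 is set from the interrupted MSR image in r12; "beql" both
	 * branches and links when the bit is clear, so load_up_fpu /
	 * load_up_altivec run only for state that is not yet loaded, and
	 * their final blr brings control back here.
	 */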
	andi.	r5,r12,MSR_FP
	beql+	load_up_fpu		/* skip if already loaded */
	andis.	r5,r12,MSR_VEC@h
	beql+	load_up_altivec		/* skip if already loaded */

	ld	r4,PACACURRENT(r13)
	addi	r4,r4,THREAD		/* Get THREAD */
	li	r6,1
	stw	r6,THREAD_USED_VSR(r4)	/* also set thread_struct.used_vsr */
	/* enable use of VSX after return */
	oris	r12,r12,MSR_VSX@h
	std	r12,_MSR(r1)
	b	fast_interrupt_return

#endif /* CONFIG_VSX */


/*
 * The routines below are in assembler so we can closely control the
 * usage of floating-point registers.  These routines must be called
 * with preempt disabled.
 */
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)
#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)
#endif
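
/*
 * Either way, LDCONST(fr1, fpone) leaves 1.0 in fr1: a lis/lfs pair
 * loading the single-precision image on 32-bit, or a single TOC-relative
 * lfd of the double-precision image on 64-bit.
 */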

	.text
/*
 * Internal routine to enable floating point and set FPSCR to 0.
 * Don't call it from C; it doesn't use the normal calling convention.
 */
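/*
 * Protocol: each routine below saves LR with "mflr r12" and then does
 * "bl fpenable"; fpdisable restores LR from r12, so ending a routine
 * with "b fpdisable" returns straight to the original caller.  fpenable
 * leaves a 64-byte frame with fr0/fr1/fr31 saved at offsets 24/16/8,
 * the old MSR in r10 and the old FPSCR in fr31; offsets 32..56 are free
 * for routines that need to save fr2-fr5.
 */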
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	mfmsr	r10			/* r10: saved MSR for fpdisable */
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync
	stfd	fr0,24(r1)
	stfd	fr1,16(r1)
	stfd	fr31,8(r1)
	LDCONST(fr1, fpzero)
	mffs	fr31			/* fr31: saved FPSCR */
	MTFSF_L(fr1)			/* clear FPSCR */
	blr

fpdisable:
	mtlr	r12			/* caller stashed its LR in r12 */
	MTFSF_L(fr31)			/* restore FPSCR */
	lfd	fr31,8(r1)
	lfd	fr1,16(r1)
	lfd	fr0,24(r1)
	mtmsr	r10			/* restore the original MSR */
	isync
	addi	r1,r1,64
	blr

/*
 * Vector add, floating point.
 * r3 -> destination, r4, r5 -> sources; four single-precision elements.
 */
_GLOBAL(vaddfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
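
/*
 * For reference, a C sketch of what vaddfp (and, mutatis mutandis, the
 * routines below) computes.  The float * prototype is illustrative; the
 * in-kernel callers in the vector-assist emulation path pass pointers
 * into 16-byte vector images:
 *
 *	void vaddfp(float *dst, const float *a, const float *b)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			dst[i] = a[i] + b[i];
 *	}
 */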

/*
 * Vector subtract, floating point.
 * r3 -> destination, r4, r5 -> sources; computes dst[i] = a[i] - b[i].
 */
_GLOBAL(vsubfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector multiply and add, floating point.
 * r3 -> destination, r4, r5, r6 -> sources; computes
 * dst[i] = a[i] * c[i] + b[i] with a, b, c taken from r4, r5, r6.
 */
_GLOBAL(vmaddfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fmadds	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector negative multiply and subtract, floating point.
 * r3 -> destination, r4, r5, r6 -> sources; computes
 * dst[i] = -(a[i] * c[i] - b[i]).
 */
_GLOBAL(vnmsubfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fnmsubs	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector reciprocal estimate.  We just compute 1.0/x.
 * r3 -> destination, r4 -> source.
 */
_GLOBAL(vrefp)
	mflr	r12
	bl	fpenable
	li	r0,4
	LDCONST(fr1, fpone)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector reciprocal square-root estimate, floating point.
 * We use the frsqrte instruction for the initial estimate followed
 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
 * r3 -> destination, r4 -> source.
 */
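/*
 * Why the loop body works: Newton-Raphson on g(r) = 1/r^2 - s, whose
 * root is r = 1/sqrt(s), gives
 *
 *	r' = r - g(r)/g'(r) = r + (r/2) * (1 - s*r*r)
 *
 * which is exactly the fmuls/fnmsubs/fmadds sequence below, applied
 * twice to the frsqrte seed.
 */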
_GLOBAL(vrsqrtefp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	li	r0,4
	LDCONST(fr4, fpone)
	LDCONST(fr5, fphalf)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	frsqrte	fr1,fr0		/* r = frsqrte(s) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr5,56(r1)
	lfd	fr4,48(r1)
	lfd	fr3,40(r1)
	lfd	fr2,32(r1)
	b	fpdisable