xref: /OK3568_Linux_fs/kernel/arch/powerpc/lib/copyuser_power7.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Copyright (C) IBM Corporation, 2011
5*4882a593Smuzhiyun *
6*4882a593Smuzhiyun * Author: Anton Blanchard <anton@au.ibm.com>
7*4882a593Smuzhiyun */
8*4882a593Smuzhiyun#include <asm/ppc_asm.h>
9*4882a593Smuzhiyun
/*
 * Fallback for builds that do not predefine SELFTEST_CASE. The value is
 * assigned to the assembler symbol test_feature just ahead of the
 * CPU_FTR_ALTIVEC feature section in __copy_tofrom_user_power7.
 */
10*4882a593Smuzhiyun#ifndef SELFTEST_CASE
11*4882a593Smuzhiyun/* 0 == don't use VMX, 1 == use VMX */
12*4882a593Smuzhiyun#define SELFTEST_CASE	0
13*4882a593Smuzhiyun#endif
14*4882a593Smuzhiyun
/*
 * Endian-dependent wrappers used by the unaligned VMX copy path.
 * Big-endian builds generate the permute control vector with lvsl and hand
 * VRA, VRB to vperm in the order given. Otherwise lvsr is used and the two
 * vperm source vectors are swapped.
 */
15*4882a593Smuzhiyun#ifdef __BIG_ENDIAN__
16*4882a593Smuzhiyun#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
17*4882a593Smuzhiyun#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
18*4882a593Smuzhiyun#else
19*4882a593Smuzhiyun#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
20*4882a593Smuzhiyun#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
21*4882a593Smuzhiyun#endif
22*4882a593Smuzhiyun
/*
 * err1 is written directly in front of a load or store. It drops local
 * label 100 at the address of that instruction and passes it, together with
 * .Ldo_err1, to EX_TABLE. EX_TABLE is defined outside this file, presumably
 * it records an exception-table entry so that a fault at the tagged
 * instruction resumes at .Ldo_err1 (confirm against its definition).
 * In this file err1 tags accesses made while no stack frame is allocated
 * and no non-volatile registers are in use.
 */
23*4882a593Smuzhiyun	.macro err1
24*4882a593Smuzhiyun100:
25*4882a593Smuzhiyun	EX_TABLE(100b,.Ldo_err1)
26*4882a593Smuzhiyun	.endm
27*4882a593Smuzhiyun
/*
 * err2 is written directly in front of a load or store. It drops local
 * label 200 at the address of that instruction and passes it, together with
 * .Ldo_err2, to EX_TABLE (defined outside this file, presumably an
 * exception-table fixup registration).
 * In this file err2 tags the accesses of the non-VMX 128B loop, which runs
 * with a stack frame allocated and r14-r22 saved in it.
 */
28*4882a593Smuzhiyun	.macro err2
29*4882a593Smuzhiyun200:
30*4882a593Smuzhiyun	EX_TABLE(200b,.Ldo_err2)
31*4882a593Smuzhiyun	.endm
32*4882a593Smuzhiyun
33*4882a593Smuzhiyun#ifdef CONFIG_ALTIVEC
/*
 * err3 is written directly in front of a load or store. It drops local
 * label 300 at the address of that instruction and passes it, together with
 * .Ldo_err3, to EX_TABLE (defined outside this file, presumably an
 * exception-table fixup registration).
 * In this file err3 tags VMX-path accesses made after enter_vmx_usercopy
 * has been called and while r14-r16 still hold their original values.
 */
34*4882a593Smuzhiyun	.macro err3
35*4882a593Smuzhiyun300:
36*4882a593Smuzhiyun	EX_TABLE(300b,.Ldo_err3)
37*4882a593Smuzhiyun	.endm
38*4882a593Smuzhiyun
/*
 * err4 is written directly in front of a load or store. It drops local
 * label 400 at the address of that instruction and passes it, together with
 * .Ldo_err4, to EX_TABLE (defined outside this file, presumably an
 * exception-table fixup registration).
 * In this file err4 tags the accesses of the VMX 128B loops, where r14-r16
 * have been saved to the frame and reused as index registers.
 */
39*4882a593Smuzhiyun	.macro err4
40*4882a593Smuzhiyun400:
41*4882a593Smuzhiyun	EX_TABLE(400b,.Ldo_err4)
42*4882a593Smuzhiyun	.endm
43*4882a593Smuzhiyun
44*4882a593Smuzhiyun
/*
 * Fixup targets for the VMX paths.
 *
 * .Ldo_err4: reached from accesses tagged err4. Reloads r14-r16 from the
 * stack frame, then falls through into .Ldo_err3.
 *
 * .Ldo_err3: reached from accesses tagged err3. Calls exit_vmx_usercopy,
 * reloads into LR the return address that .Lvmx_copy stored 16 bytes above
 * the pre-frame stack pointer (the bl overwrote LR), and branches to .Lexit,
 * which pops the frame and continues into .Ldo_err1.
 */
45*4882a593Smuzhiyun.Ldo_err4:
46*4882a593Smuzhiyun	ld	r16,STK_REG(R16)(r1)
47*4882a593Smuzhiyun	ld	r15,STK_REG(R15)(r1)
48*4882a593Smuzhiyun	ld	r14,STK_REG(R14)(r1)
49*4882a593Smuzhiyun.Ldo_err3:
50*4882a593Smuzhiyun	bl	exit_vmx_usercopy
51*4882a593Smuzhiyun	ld	r0,STACKFRAMESIZE+16(r1)
52*4882a593Smuzhiyun	mtlr	r0
53*4882a593Smuzhiyun	b	.Lexit
54*4882a593Smuzhiyun#endif /* CONFIG_ALTIVEC */
55*4882a593Smuzhiyun
/*
 * Fixup targets for the non-VMX path, and the common exit of all fixups.
 *
 * .Ldo_err2: reached from accesses tagged err2. Reloads r14-r22 from the
 * stack frame and falls through into .Lexit.
 *
 * .Lexit: pops the STACKFRAMESIZE frame and falls through into .Ldo_err1.
 *
 * .Ldo_err1: entered with no frame allocated. Reloads the original r3, r4
 * and r5 that the function entry stored below the stack pointer, then
 * branches to __copy_tofrom_user_base so the copy is redone from the start
 * with the original arguments. That routine is defined elsewhere and is
 * presumably what produces the final return value (confirm there).
 */
56*4882a593Smuzhiyun.Ldo_err2:
57*4882a593Smuzhiyun	ld	r22,STK_REG(R22)(r1)
58*4882a593Smuzhiyun	ld	r21,STK_REG(R21)(r1)
59*4882a593Smuzhiyun	ld	r20,STK_REG(R20)(r1)
60*4882a593Smuzhiyun	ld	r19,STK_REG(R19)(r1)
61*4882a593Smuzhiyun	ld	r18,STK_REG(R18)(r1)
62*4882a593Smuzhiyun	ld	r17,STK_REG(R17)(r1)
63*4882a593Smuzhiyun	ld	r16,STK_REG(R16)(r1)
64*4882a593Smuzhiyun	ld	r15,STK_REG(R15)(r1)
65*4882a593Smuzhiyun	ld	r14,STK_REG(R14)(r1)
66*4882a593Smuzhiyun.Lexit:
67*4882a593Smuzhiyun	addi	r1,r1,STACKFRAMESIZE
68*4882a593Smuzhiyun.Ldo_err1:
69*4882a593Smuzhiyun	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
70*4882a593Smuzhiyun	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
71*4882a593Smuzhiyun	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
72*4882a593Smuzhiyun	b	__copy_tofrom_user_base
73*4882a593Smuzhiyun
74*4882a593Smuzhiyun
/*
 * __copy_tofrom_user_power7
 *
 * In:  r3 = destination address (stored to)
 *      r4 = source address (loaded from)
 *      r5 = number of bytes to copy
 * Out: the non-VMX path returns with r3 = 0. The VMX paths finish with a
 *      tail call to exit_vmx_usercopy, so r3 is whatever that routine
 *      leaves (presumably 0, confirm there). A faulting access diverts to
 *      the .Ldo_err* fixups, which end in __copy_tofrom_user_base.
 *
 * Dispatch: fewer than 16 bytes goes straight to .Lshort_copy. More than
 * 3328 bytes goes to .Lvmx_copy when CONFIG_ALTIVEC is built in and the
 * CPU_FTR_ALTIVEC feature section is active. Everything else falls through
 * to .Lnonvmx_copy.
 */
75*4882a593Smuzhiyun_GLOBAL(__copy_tofrom_user_power7)
	/* cr0 holds the compare against 16, cr1 the compare against 3328 */
76*4882a593Smuzhiyun	cmpldi	r5,16
77*4882a593Smuzhiyun	cmpldi	cr1,r5,3328
78*4882a593Smuzhiyun
	/*
	 * Stash the original arguments below the stack pointer, at the
	 * offsets they will have inside a frame of STACKFRAMESIZE bytes.
	 * .Ldo_err1 and .Lvmx_copy read them back from these slots.
	 */
79*4882a593Smuzhiyun	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
80*4882a593Smuzhiyun	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
81*4882a593Smuzhiyun	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
82*4882a593Smuzhiyun
83*4882a593Smuzhiyun	blt	.Lshort_copy
84*4882a593Smuzhiyun
85*4882a593Smuzhiyun#ifdef CONFIG_ALTIVEC
86*4882a593Smuzhiyuntest_feature = SELFTEST_CASE
87*4882a593SmuzhiyunBEGIN_FTR_SECTION
88*4882a593Smuzhiyun	bgt	cr1,.Lvmx_copy
89*4882a593SmuzhiyunEND_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
90*4882a593Smuzhiyun#endif
91*4882a593Smuzhiyun
/*
 * Integer-register copy path. Entered with r3 = destination, r4 = source,
 * r5 = byte count. Reached by fall-through from the function entry (where
 * copies shorter than 16 bytes have already branched to .Lshort_copy) and
 * from .Lunwind_stack_nonvmx_copy.
 *
 * Stages: align the source to 8 bytes, copy whole 128B blocks in a counted
 * loop, copy 64/32/16 byte pieces, then fall into .Lshort_copy with the
 * remaining 0-15 bytes in r5.
 */
92*4882a593Smuzhiyun.Lnonvmx_copy:
93*4882a593Smuzhiyun	/* Get the source 8B aligned */
	/*
	 * mtocrf 0x01 places the low 4 bits of -source in cr7. Each bf below
	 * tests one of those bits and skips the matching 1, 2 or 4 byte step.
	 * r6 keeps the low 3 bits, the number of bytes these steps copy.
	 */
94*4882a593Smuzhiyun	neg	r6,r4
95*4882a593Smuzhiyun	mtocrf	0x01,r6
96*4882a593Smuzhiyun	clrldi	r6,r6,(64-3)
97*4882a593Smuzhiyun
98*4882a593Smuzhiyun	bf	cr7*4+3,1f
99*4882a593Smuzhiyunerr1;	lbz	r0,0(r4)
100*4882a593Smuzhiyun	addi	r4,r4,1
101*4882a593Smuzhiyunerr1;	stb	r0,0(r3)
102*4882a593Smuzhiyun	addi	r3,r3,1
103*4882a593Smuzhiyun
104*4882a593Smuzhiyun1:	bf	cr7*4+2,2f
105*4882a593Smuzhiyunerr1;	lhz	r0,0(r4)
106*4882a593Smuzhiyun	addi	r4,r4,2
107*4882a593Smuzhiyunerr1;	sth	r0,0(r3)
108*4882a593Smuzhiyun	addi	r3,r3,2
109*4882a593Smuzhiyun
110*4882a593Smuzhiyun2:	bf	cr7*4+1,3f
111*4882a593Smuzhiyunerr1;	lwz	r0,0(r4)
112*4882a593Smuzhiyun	addi	r4,r4,4
113*4882a593Smuzhiyunerr1;	stw	r0,0(r3)
114*4882a593Smuzhiyun	addi	r3,r3,4
115*4882a593Smuzhiyun
	/* r5 = bytes left after alignment. Under 128 skips the block loop. */
116*4882a593Smuzhiyun3:	sub	r5,r5,r6
117*4882a593Smuzhiyun	cmpldi	r5,128
118*4882a593Smuzhiyun	blt	5f
119*4882a593Smuzhiyun
	/*
	 * Allocate a frame and save r14-r22 in it. The loop below uses
	 * r14-r21 as extra data registers. LR is stored 16 bytes above the
	 * pre-frame stack pointer.
	 */
120*4882a593Smuzhiyun	mflr	r0
121*4882a593Smuzhiyun	stdu	r1,-STACKFRAMESIZE(r1)
122*4882a593Smuzhiyun	std	r14,STK_REG(R14)(r1)
123*4882a593Smuzhiyun	std	r15,STK_REG(R15)(r1)
124*4882a593Smuzhiyun	std	r16,STK_REG(R16)(r1)
125*4882a593Smuzhiyun	std	r17,STK_REG(R17)(r1)
126*4882a593Smuzhiyun	std	r18,STK_REG(R18)(r1)
127*4882a593Smuzhiyun	std	r19,STK_REG(R19)(r1)
128*4882a593Smuzhiyun	std	r20,STK_REG(R20)(r1)
129*4882a593Smuzhiyun	std	r21,STK_REG(R21)(r1)
130*4882a593Smuzhiyun	std	r22,STK_REG(R22)(r1)
131*4882a593Smuzhiyun	std	r0,STACKFRAMESIZE+16(r1)
132*4882a593Smuzhiyun
	/* CTR = number of whole 128B blocks, at least 1 here */
133*4882a593Smuzhiyun	srdi	r6,r5,7
134*4882a593Smuzhiyun	mtctr	r6
135*4882a593Smuzhiyun
136*4882a593Smuzhiyun	/* Now do cacheline (128B) sized loads and stores. */
	/*
	 * Sixteen doubleword loads, then sixteen doubleword stores, per
	 * iteration. Every access is tagged err2, whose fixup target
	 * .Ldo_err2 reloads r14-r22 and pops this frame.
	 */
137*4882a593Smuzhiyun	.align	5
138*4882a593Smuzhiyun4:
139*4882a593Smuzhiyunerr2;	ld	r0,0(r4)
140*4882a593Smuzhiyunerr2;	ld	r6,8(r4)
141*4882a593Smuzhiyunerr2;	ld	r7,16(r4)
142*4882a593Smuzhiyunerr2;	ld	r8,24(r4)
143*4882a593Smuzhiyunerr2;	ld	r9,32(r4)
144*4882a593Smuzhiyunerr2;	ld	r10,40(r4)
145*4882a593Smuzhiyunerr2;	ld	r11,48(r4)
146*4882a593Smuzhiyunerr2;	ld	r12,56(r4)
147*4882a593Smuzhiyunerr2;	ld	r14,64(r4)
148*4882a593Smuzhiyunerr2;	ld	r15,72(r4)
149*4882a593Smuzhiyunerr2;	ld	r16,80(r4)
150*4882a593Smuzhiyunerr2;	ld	r17,88(r4)
151*4882a593Smuzhiyunerr2;	ld	r18,96(r4)
152*4882a593Smuzhiyunerr2;	ld	r19,104(r4)
153*4882a593Smuzhiyunerr2;	ld	r20,112(r4)
154*4882a593Smuzhiyunerr2;	ld	r21,120(r4)
155*4882a593Smuzhiyun	addi	r4,r4,128
156*4882a593Smuzhiyunerr2;	std	r0,0(r3)
157*4882a593Smuzhiyunerr2;	std	r6,8(r3)
158*4882a593Smuzhiyunerr2;	std	r7,16(r3)
159*4882a593Smuzhiyunerr2;	std	r8,24(r3)
160*4882a593Smuzhiyunerr2;	std	r9,32(r3)
161*4882a593Smuzhiyunerr2;	std	r10,40(r3)
162*4882a593Smuzhiyunerr2;	std	r11,48(r3)
163*4882a593Smuzhiyunerr2;	std	r12,56(r3)
164*4882a593Smuzhiyunerr2;	std	r14,64(r3)
165*4882a593Smuzhiyunerr2;	std	r15,72(r3)
166*4882a593Smuzhiyunerr2;	std	r16,80(r3)
167*4882a593Smuzhiyunerr2;	std	r17,88(r3)
168*4882a593Smuzhiyunerr2;	std	r18,96(r3)
169*4882a593Smuzhiyunerr2;	std	r19,104(r3)
170*4882a593Smuzhiyunerr2;	std	r20,112(r3)
171*4882a593Smuzhiyunerr2;	std	r21,120(r3)
172*4882a593Smuzhiyun	addi	r3,r3,128
173*4882a593Smuzhiyun	bdnz	4b
174*4882a593Smuzhiyun
	/* r5 = bytes left over after the whole blocks (0-127) */
175*4882a593Smuzhiyun	clrldi	r5,r5,(64-7)
176*4882a593Smuzhiyun
	/* Restore r14-r22 and release the frame */
177*4882a593Smuzhiyun	ld	r14,STK_REG(R14)(r1)
178*4882a593Smuzhiyun	ld	r15,STK_REG(R15)(r1)
179*4882a593Smuzhiyun	ld	r16,STK_REG(R16)(r1)
180*4882a593Smuzhiyun	ld	r17,STK_REG(R17)(r1)
181*4882a593Smuzhiyun	ld	r18,STK_REG(R18)(r1)
182*4882a593Smuzhiyun	ld	r19,STK_REG(R19)(r1)
183*4882a593Smuzhiyun	ld	r20,STK_REG(R20)(r1)
184*4882a593Smuzhiyun	ld	r21,STK_REG(R21)(r1)
185*4882a593Smuzhiyun	ld	r22,STK_REG(R22)(r1)
186*4882a593Smuzhiyun	addi	r1,r1,STACKFRAMESIZE
187*4882a593Smuzhiyun
188*4882a593Smuzhiyun	/* Up to 127B to go */
	/*
	 * cr7 = low 4 bits of (r5 >> 4), so the three bf tests below select
	 * the 64, 32 and 16 byte steps. No frame is allocated from here on,
	 * hence the err1 tags.
	 */
189*4882a593Smuzhiyun5:	srdi	r6,r5,4
190*4882a593Smuzhiyun	mtocrf	0x01,r6
191*4882a593Smuzhiyun
192*4882a593Smuzhiyun6:	bf	cr7*4+1,7f
193*4882a593Smuzhiyunerr1;	ld	r0,0(r4)
194*4882a593Smuzhiyunerr1;	ld	r6,8(r4)
195*4882a593Smuzhiyunerr1;	ld	r7,16(r4)
196*4882a593Smuzhiyunerr1;	ld	r8,24(r4)
197*4882a593Smuzhiyunerr1;	ld	r9,32(r4)
198*4882a593Smuzhiyunerr1;	ld	r10,40(r4)
199*4882a593Smuzhiyunerr1;	ld	r11,48(r4)
200*4882a593Smuzhiyunerr1;	ld	r12,56(r4)
201*4882a593Smuzhiyun	addi	r4,r4,64
202*4882a593Smuzhiyunerr1;	std	r0,0(r3)
203*4882a593Smuzhiyunerr1;	std	r6,8(r3)
204*4882a593Smuzhiyunerr1;	std	r7,16(r3)
205*4882a593Smuzhiyunerr1;	std	r8,24(r3)
206*4882a593Smuzhiyunerr1;	std	r9,32(r3)
207*4882a593Smuzhiyunerr1;	std	r10,40(r3)
208*4882a593Smuzhiyunerr1;	std	r11,48(r3)
209*4882a593Smuzhiyunerr1;	std	r12,56(r3)
210*4882a593Smuzhiyun	addi	r3,r3,64
211*4882a593Smuzhiyun
212*4882a593Smuzhiyun	/* Up to 63B to go */
213*4882a593Smuzhiyun7:	bf	cr7*4+2,8f
214*4882a593Smuzhiyunerr1;	ld	r0,0(r4)
215*4882a593Smuzhiyunerr1;	ld	r6,8(r4)
216*4882a593Smuzhiyunerr1;	ld	r7,16(r4)
217*4882a593Smuzhiyunerr1;	ld	r8,24(r4)
218*4882a593Smuzhiyun	addi	r4,r4,32
219*4882a593Smuzhiyunerr1;	std	r0,0(r3)
220*4882a593Smuzhiyunerr1;	std	r6,8(r3)
221*4882a593Smuzhiyunerr1;	std	r7,16(r3)
222*4882a593Smuzhiyunerr1;	std	r8,24(r3)
223*4882a593Smuzhiyun	addi	r3,r3,32
224*4882a593Smuzhiyun
225*4882a593Smuzhiyun	/* Up to 31B to go */
226*4882a593Smuzhiyun8:	bf	cr7*4+3,9f
227*4882a593Smuzhiyunerr1;	ld	r0,0(r4)
228*4882a593Smuzhiyunerr1;	ld	r6,8(r4)
229*4882a593Smuzhiyun	addi	r4,r4,16
230*4882a593Smuzhiyunerr1;	std	r0,0(r3)
231*4882a593Smuzhiyunerr1;	std	r6,8(r3)
232*4882a593Smuzhiyun	addi	r3,r3,16
233*4882a593Smuzhiyun
	/* r5 = final 0-15 bytes, handled by .Lshort_copy directly below */
234*4882a593Smuzhiyun9:	clrldi	r5,r5,(64-4)
235*4882a593Smuzhiyun
236*4882a593Smuzhiyun	/* Up to 15B to go */
	/*
	 * Reached by fall-through from the non-VMX tail, and directly from
	 * the function entry when the requested length is below 16.
	 * cr7 receives the low 4 bits of r5. Its four bits select, in order,
	 * an 8, 4, 2 and 1 byte step. Returns with r3 = 0.
	 */
237*4882a593Smuzhiyun.Lshort_copy:
238*4882a593Smuzhiyun	mtocrf	0x01,r5
239*4882a593Smuzhiyun	bf	cr7*4+0,12f
240*4882a593Smuzhiyunerr1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
241*4882a593Smuzhiyunerr1;	lwz	r6,4(r4)
242*4882a593Smuzhiyun	addi	r4,r4,8
243*4882a593Smuzhiyunerr1;	stw	r0,0(r3)
244*4882a593Smuzhiyunerr1;	stw	r6,4(r3)
245*4882a593Smuzhiyun	addi	r3,r3,8
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun12:	bf	cr7*4+1,13f
248*4882a593Smuzhiyunerr1;	lwz	r0,0(r4)
249*4882a593Smuzhiyun	addi	r4,r4,4
250*4882a593Smuzhiyunerr1;	stw	r0,0(r3)
251*4882a593Smuzhiyun	addi	r3,r3,4
252*4882a593Smuzhiyun
253*4882a593Smuzhiyun13:	bf	cr7*4+2,14f
254*4882a593Smuzhiyunerr1;	lhz	r0,0(r4)
255*4882a593Smuzhiyun	addi	r4,r4,2
256*4882a593Smuzhiyunerr1;	sth	r0,0(r3)
257*4882a593Smuzhiyun	addi	r3,r3,2
258*4882a593Smuzhiyun
259*4882a593Smuzhiyun14:	bf	cr7*4+3,15f
260*4882a593Smuzhiyunerr1;	lbz	r0,0(r4)
261*4882a593Smuzhiyunerr1;	stb	r0,0(r3)
262*4882a593Smuzhiyun
	/* Whole copy completed without a fault: return 0 */
263*4882a593Smuzhiyun15:	li	r3,0
264*4882a593Smuzhiyun	blr
265*4882a593Smuzhiyun
/*
 * Taken from .Lvmx_copy when enter_vmx_usercopy returned 0. Releases the
 * frame that .Lvmx_copy allocated and performs the copy with the integer
 * path instead. r3, r4, r5 and LR were already reloaded by .Lvmx_copy.
 */
266*4882a593Smuzhiyun.Lunwind_stack_nonvmx_copy:
267*4882a593Smuzhiyun	addi	r1,r1,STACKFRAMESIZE
268*4882a593Smuzhiyun	b	.Lnonvmx_copy
269*4882a593Smuzhiyun
/*
 * VMX copy path, entered from the function entry for lengths above 3328
 * bytes. Allocates a frame, calls enter_vmx_usercopy, programs the prefetch
 * streams, then either copies with lvx/stvx (source and destination share
 * the same offset within 16 bytes) or branches to .Lvmx_unaligned_copy.
 * If enter_vmx_usercopy returned 0 the frame is dropped and the non-VMX
 * path is used.
 */
270*4882a593Smuzhiyun.Lvmx_copy:
271*4882a593Smuzhiyun#ifdef CONFIG_ALTIVEC
	/* Save LR and allocate a frame, the bl below overwrites LR */
272*4882a593Smuzhiyun	mflr	r0
273*4882a593Smuzhiyun	std	r0,16(r1)
274*4882a593Smuzhiyun	stdu	r1,-STACKFRAMESIZE(r1)
275*4882a593Smuzhiyun	bl	enter_vmx_usercopy
	/*
	 * Keep the result of the call in cr1 (tested after the prefetch
	 * setup), then reload LR and the original r3, r4, r5 that the
	 * function entry stored in what are now this frame's slots.
	 */
276*4882a593Smuzhiyun	cmpwi	cr1,r3,0
277*4882a593Smuzhiyun	ld	r0,STACKFRAMESIZE+16(r1)
278*4882a593Smuzhiyun	ld	r3,STK_REG(R31)(r1)
279*4882a593Smuzhiyun	ld	r4,STK_REG(R30)(r1)
280*4882a593Smuzhiyun	ld	r5,STK_REG(R29)(r1)
281*4882a593Smuzhiyun	mtlr	r0
282*4882a593Smuzhiyun
283*4882a593Smuzhiyun	/*
284*4882a593Smuzhiyun	 * We prefetch both the source and destination using enhanced touch
285*4882a593Smuzhiyun	 * instructions. We use a stream ID of 0 for the load side and
286*4882a593Smuzhiyun	 * 1 for the store side.
287*4882a593Smuzhiyun	 */
	/* r6 and r9 = source and destination rounded down to 128B */
288*4882a593Smuzhiyun	clrrdi	r6,r4,7
289*4882a593Smuzhiyun	clrrdi	r9,r3,7
290*4882a593Smuzhiyun	ori	r9,r9,1		/* stream=1 */
291*4882a593Smuzhiyun
292*4882a593Smuzhiyun	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
293*4882a593Smuzhiyun	cmpldi	r7,0x3FF
294*4882a593Smuzhiyun	ble	1f
295*4882a593Smuzhiyun	li	r7,0x3FF
296*4882a593Smuzhiyun1:	lis	r0,0x0E00	/* depth=7 */
297*4882a593Smuzhiyun	sldi	r7,r7,7
298*4882a593Smuzhiyun	or	r7,r7,r0
299*4882a593Smuzhiyun	ori	r10,r7,1	/* stream=1 */
300*4882a593Smuzhiyun
301*4882a593Smuzhiyun	lis	r8,0x8000	/* GO=1 */
302*4882a593Smuzhiyun	clrldi	r8,r8,32
303*4882a593Smuzhiyun
304*4882a593Smuzhiyun	/* setup read stream 0 */
305*4882a593Smuzhiyun	dcbt	0,r6,0b01000   /* addr from */
306*4882a593Smuzhiyun	dcbt	0,r7,0b01010   /* length and depth from */
307*4882a593Smuzhiyun	/* setup write stream 1 */
308*4882a593Smuzhiyun	dcbtst	0,r9,0b01000   /* addr to */
309*4882a593Smuzhiyun	dcbtst	0,r10,0b01010  /* length and depth to */
310*4882a593Smuzhiyun	eieio
311*4882a593Smuzhiyun	dcbt	0,r8,0b01010	/* all streams GO */
312*4882a593Smuzhiyun
	/* enter_vmx_usercopy returned 0: pop the frame, use the integer path */
313*4882a593Smuzhiyun	beq	cr1,.Lunwind_stack_nonvmx_copy
314*4882a593Smuzhiyun
315*4882a593Smuzhiyun	/*
316*4882a593Smuzhiyun	 * If source and destination are not relatively aligned we use a
317*4882a593Smuzhiyun	 * slower permute loop.
318*4882a593Smuzhiyun	 */
	/* Low 4 bits of (source XOR destination), nonzero means they differ */
319*4882a593Smuzhiyun	xor	r6,r4,r3
320*4882a593Smuzhiyun	rldicl.	r6,r6,0,(64-4)
321*4882a593Smuzhiyun	bne	.Lvmx_unaligned_copy
322*4882a593Smuzhiyun
323*4882a593Smuzhiyun	/* Get the destination 16B aligned */
	/*
	 * cr7 = low 4 bits of -destination, selecting 1, 2, 4 and 8 byte
	 * steps. r6 = the same 4 bits as a byte count. Because the low
	 * 4 address bits match, the source reaches 16B alignment as well.
	 */
324*4882a593Smuzhiyun	neg	r6,r3
325*4882a593Smuzhiyun	mtocrf	0x01,r6
326*4882a593Smuzhiyun	clrldi	r6,r6,(64-4)
327*4882a593Smuzhiyun
328*4882a593Smuzhiyun	bf	cr7*4+3,1f
329*4882a593Smuzhiyunerr3;	lbz	r0,0(r4)
330*4882a593Smuzhiyun	addi	r4,r4,1
331*4882a593Smuzhiyunerr3;	stb	r0,0(r3)
332*4882a593Smuzhiyun	addi	r3,r3,1
333*4882a593Smuzhiyun
334*4882a593Smuzhiyun1:	bf	cr7*4+2,2f
335*4882a593Smuzhiyunerr3;	lhz	r0,0(r4)
336*4882a593Smuzhiyun	addi	r4,r4,2
337*4882a593Smuzhiyunerr3;	sth	r0,0(r3)
338*4882a593Smuzhiyun	addi	r3,r3,2
339*4882a593Smuzhiyun
340*4882a593Smuzhiyun2:	bf	cr7*4+1,3f
341*4882a593Smuzhiyunerr3;	lwz	r0,0(r4)
342*4882a593Smuzhiyun	addi	r4,r4,4
343*4882a593Smuzhiyunerr3;	stw	r0,0(r3)
344*4882a593Smuzhiyun	addi	r3,r3,4
345*4882a593Smuzhiyun
346*4882a593Smuzhiyun3:	bf	cr7*4+0,4f
347*4882a593Smuzhiyunerr3;	ld	r0,0(r4)
348*4882a593Smuzhiyun	addi	r4,r4,8
349*4882a593Smuzhiyunerr3;	std	r0,0(r3)
350*4882a593Smuzhiyun	addi	r3,r3,8
351*4882a593Smuzhiyun
352*4882a593Smuzhiyun4:	sub	r5,r5,r6
353*4882a593Smuzhiyun
354*4882a593Smuzhiyun	/* Get the destination 128B aligned */
	/*
	 * cr7 = low 4 bits of (-destination >> 4), selecting 16, 32 and
	 * 64 byte steps. r6 = -destination modulo 128, the bytes they copy.
	 * r9, r10, r11 are index registers for lvx and stvx.
	 */
355*4882a593Smuzhiyun	neg	r6,r3
356*4882a593Smuzhiyun	srdi	r7,r6,4
357*4882a593Smuzhiyun	mtocrf	0x01,r7
358*4882a593Smuzhiyun	clrldi	r6,r6,(64-7)
359*4882a593Smuzhiyun
360*4882a593Smuzhiyun	li	r9,16
361*4882a593Smuzhiyun	li	r10,32
362*4882a593Smuzhiyun	li	r11,48
363*4882a593Smuzhiyun
364*4882a593Smuzhiyun	bf	cr7*4+3,5f
365*4882a593Smuzhiyunerr3;	lvx	v1,0,r4
366*4882a593Smuzhiyun	addi	r4,r4,16
367*4882a593Smuzhiyunerr3;	stvx	v1,0,r3
368*4882a593Smuzhiyun	addi	r3,r3,16
369*4882a593Smuzhiyun
370*4882a593Smuzhiyun5:	bf	cr7*4+2,6f
371*4882a593Smuzhiyunerr3;	lvx	v1,0,r4
372*4882a593Smuzhiyunerr3;	lvx	v0,r4,r9
373*4882a593Smuzhiyun	addi	r4,r4,32
374*4882a593Smuzhiyunerr3;	stvx	v1,0,r3
375*4882a593Smuzhiyunerr3;	stvx	v0,r3,r9
376*4882a593Smuzhiyun	addi	r3,r3,32
377*4882a593Smuzhiyun
378*4882a593Smuzhiyun6:	bf	cr7*4+1,7f
379*4882a593Smuzhiyunerr3;	lvx	v3,0,r4
380*4882a593Smuzhiyunerr3;	lvx	v2,r4,r9
381*4882a593Smuzhiyunerr3;	lvx	v1,r4,r10
382*4882a593Smuzhiyunerr3;	lvx	v0,r4,r11
383*4882a593Smuzhiyun	addi	r4,r4,64
384*4882a593Smuzhiyunerr3;	stvx	v3,0,r3
385*4882a593Smuzhiyunerr3;	stvx	v2,r3,r9
386*4882a593Smuzhiyunerr3;	stvx	v1,r3,r10
387*4882a593Smuzhiyunerr3;	stvx	v0,r3,r11
388*4882a593Smuzhiyun	addi	r3,r3,64
389*4882a593Smuzhiyun
	/* r5 = bytes left, r6 = number of whole 128B blocks (goes to CTR) */
390*4882a593Smuzhiyun7:	sub	r5,r5,r6
391*4882a593Smuzhiyun	srdi	r6,r5,7
392*4882a593Smuzhiyun
	/*
	 * r14-r16 are saved to the frame and reused as index registers.
	 * Accesses from here to the end of the loop are tagged err4 so the
	 * fixup reloads them.
	 */
393*4882a593Smuzhiyun	std	r14,STK_REG(R14)(r1)
394*4882a593Smuzhiyun	std	r15,STK_REG(R15)(r1)
395*4882a593Smuzhiyun	std	r16,STK_REG(R16)(r1)
396*4882a593Smuzhiyun
397*4882a593Smuzhiyun	li	r12,64
398*4882a593Smuzhiyun	li	r14,80
399*4882a593Smuzhiyun	li	r15,96
400*4882a593Smuzhiyun	li	r16,112
401*4882a593Smuzhiyun
402*4882a593Smuzhiyun	mtctr	r6
403*4882a593Smuzhiyun
404*4882a593Smuzhiyun	/*
405*4882a593Smuzhiyun	 * Now do cacheline sized loads and stores. By this stage the
406*4882a593Smuzhiyun	 * cacheline stores are also cacheline aligned.
407*4882a593Smuzhiyun	 */
408*4882a593Smuzhiyun	.align	5
409*4882a593Smuzhiyun8:
410*4882a593Smuzhiyunerr4;	lvx	v7,0,r4
411*4882a593Smuzhiyunerr4;	lvx	v6,r4,r9
412*4882a593Smuzhiyunerr4;	lvx	v5,r4,r10
413*4882a593Smuzhiyunerr4;	lvx	v4,r4,r11
414*4882a593Smuzhiyunerr4;	lvx	v3,r4,r12
415*4882a593Smuzhiyunerr4;	lvx	v2,r4,r14
416*4882a593Smuzhiyunerr4;	lvx	v1,r4,r15
417*4882a593Smuzhiyunerr4;	lvx	v0,r4,r16
418*4882a593Smuzhiyun	addi	r4,r4,128
419*4882a593Smuzhiyunerr4;	stvx	v7,0,r3
420*4882a593Smuzhiyunerr4;	stvx	v6,r3,r9
421*4882a593Smuzhiyunerr4;	stvx	v5,r3,r10
422*4882a593Smuzhiyunerr4;	stvx	v4,r3,r11
423*4882a593Smuzhiyunerr4;	stvx	v3,r3,r12
424*4882a593Smuzhiyunerr4;	stvx	v2,r3,r14
425*4882a593Smuzhiyunerr4;	stvx	v1,r3,r15
426*4882a593Smuzhiyunerr4;	stvx	v0,r3,r16
427*4882a593Smuzhiyun	addi	r3,r3,128
428*4882a593Smuzhiyun	bdnz	8b
429*4882a593Smuzhiyun
	/* r14-r16 restored, so the remaining accesses go back to err3 */
430*4882a593Smuzhiyun	ld	r14,STK_REG(R14)(r1)
431*4882a593Smuzhiyun	ld	r15,STK_REG(R15)(r1)
432*4882a593Smuzhiyun	ld	r16,STK_REG(R16)(r1)
433*4882a593Smuzhiyun
434*4882a593Smuzhiyun	/* Up to 127B to go */
	/* cr7 = low 4 bits of (r5 >> 4): 64, 32 and 16 byte steps */
435*4882a593Smuzhiyun	clrldi	r5,r5,(64-7)
436*4882a593Smuzhiyun	srdi	r6,r5,4
437*4882a593Smuzhiyun	mtocrf	0x01,r6
438*4882a593Smuzhiyun
439*4882a593Smuzhiyun	bf	cr7*4+1,9f
440*4882a593Smuzhiyunerr3;	lvx	v3,0,r4
441*4882a593Smuzhiyunerr3;	lvx	v2,r4,r9
442*4882a593Smuzhiyunerr3;	lvx	v1,r4,r10
443*4882a593Smuzhiyunerr3;	lvx	v0,r4,r11
444*4882a593Smuzhiyun	addi	r4,r4,64
445*4882a593Smuzhiyunerr3;	stvx	v3,0,r3
446*4882a593Smuzhiyunerr3;	stvx	v2,r3,r9
447*4882a593Smuzhiyunerr3;	stvx	v1,r3,r10
448*4882a593Smuzhiyunerr3;	stvx	v0,r3,r11
449*4882a593Smuzhiyun	addi	r3,r3,64
450*4882a593Smuzhiyun
451*4882a593Smuzhiyun9:	bf	cr7*4+2,10f
452*4882a593Smuzhiyunerr3;	lvx	v1,0,r4
453*4882a593Smuzhiyunerr3;	lvx	v0,r4,r9
454*4882a593Smuzhiyun	addi	r4,r4,32
455*4882a593Smuzhiyunerr3;	stvx	v1,0,r3
456*4882a593Smuzhiyunerr3;	stvx	v0,r3,r9
457*4882a593Smuzhiyun	addi	r3,r3,32
458*4882a593Smuzhiyun
459*4882a593Smuzhiyun10:	bf	cr7*4+3,11f
460*4882a593Smuzhiyunerr3;	lvx	v1,0,r4
461*4882a593Smuzhiyun	addi	r4,r4,16
462*4882a593Smuzhiyunerr3;	stvx	v1,0,r3
463*4882a593Smuzhiyun	addi	r3,r3,16
464*4882a593Smuzhiyun
465*4882a593Smuzhiyun	/* Up to 15B to go */
	/* cr7 = low 4 bits of r5: 8, 4, 2 and 1 byte steps */
466*4882a593Smuzhiyun11:	clrldi	r5,r5,(64-4)
467*4882a593Smuzhiyun	mtocrf	0x01,r5
468*4882a593Smuzhiyun	bf	cr7*4+0,12f
469*4882a593Smuzhiyunerr3;	ld	r0,0(r4)
470*4882a593Smuzhiyun	addi	r4,r4,8
471*4882a593Smuzhiyunerr3;	std	r0,0(r3)
472*4882a593Smuzhiyun	addi	r3,r3,8
473*4882a593Smuzhiyun
474*4882a593Smuzhiyun12:	bf	cr7*4+1,13f
475*4882a593Smuzhiyunerr3;	lwz	r0,0(r4)
476*4882a593Smuzhiyun	addi	r4,r4,4
477*4882a593Smuzhiyunerr3;	stw	r0,0(r3)
478*4882a593Smuzhiyun	addi	r3,r3,4
479*4882a593Smuzhiyun
480*4882a593Smuzhiyun13:	bf	cr7*4+2,14f
481*4882a593Smuzhiyunerr3;	lhz	r0,0(r4)
482*4882a593Smuzhiyun	addi	r4,r4,2
483*4882a593Smuzhiyunerr3;	sth	r0,0(r3)
484*4882a593Smuzhiyun	addi	r3,r3,2
485*4882a593Smuzhiyun
486*4882a593Smuzhiyun14:	bf	cr7*4+3,15f
487*4882a593Smuzhiyunerr3;	lbz	r0,0(r4)
488*4882a593Smuzhiyunerr3;	stb	r0,0(r3)
489*4882a593Smuzhiyun
	/*
	 * Release the frame and finish in exit_vmx_usercopy. LR already
	 * holds the caller's return address, so that routine returns
	 * straight to the caller. The value left in r3 is up to
	 * exit_vmx_usercopy (presumably 0, confirm there).
	 */
490*4882a593Smuzhiyun15:	addi	r1,r1,STACKFRAMESIZE
491*4882a593Smuzhiyun	b	exit_vmx_usercopy	/* tail call optimise */
492*4882a593Smuzhiyun
/*
 * VMX copy for the case where source and destination differ in their low
 * 4 address bits. Entered from .Lvmx_copy with the frame allocated,
 * r3 = destination, r4 = source, r5 = byte count.
 *
 * The destination is aligned first. Source data is then fetched with lvx
 * (which ignores the low 4 address bits) and each 16 byte output is built
 * by VPERM from two consecutive source vectors using the control vector in
 * v16. v0 always carries the most recently loaded source vector into the
 * next step, and r4 runs 16 bytes ahead until it is wound back at label 11.
 */
493*4882a593Smuzhiyun.Lvmx_unaligned_copy:
494*4882a593Smuzhiyun	/* Get the destination 16B aligned */
	/*
	 * cr7 = low 4 bits of -destination, selecting 1, 2, 4 and 8 byte
	 * steps. r6 = the same 4 bits as a byte count.
	 */
495*4882a593Smuzhiyun	neg	r6,r3
496*4882a593Smuzhiyun	mtocrf	0x01,r6
497*4882a593Smuzhiyun	clrldi	r6,r6,(64-4)
498*4882a593Smuzhiyun
499*4882a593Smuzhiyun	bf	cr7*4+3,1f
500*4882a593Smuzhiyunerr3;	lbz	r0,0(r4)
501*4882a593Smuzhiyun	addi	r4,r4,1
502*4882a593Smuzhiyunerr3;	stb	r0,0(r3)
503*4882a593Smuzhiyun	addi	r3,r3,1
504*4882a593Smuzhiyun
505*4882a593Smuzhiyun1:	bf	cr7*4+2,2f
506*4882a593Smuzhiyunerr3;	lhz	r0,0(r4)
507*4882a593Smuzhiyun	addi	r4,r4,2
508*4882a593Smuzhiyunerr3;	sth	r0,0(r3)
509*4882a593Smuzhiyun	addi	r3,r3,2
510*4882a593Smuzhiyun
511*4882a593Smuzhiyun2:	bf	cr7*4+1,3f
512*4882a593Smuzhiyunerr3;	lwz	r0,0(r4)
513*4882a593Smuzhiyun	addi	r4,r4,4
514*4882a593Smuzhiyunerr3;	stw	r0,0(r3)
515*4882a593Smuzhiyun	addi	r3,r3,4
516*4882a593Smuzhiyun
517*4882a593Smuzhiyun3:	bf	cr7*4+0,4f
518*4882a593Smuzhiyunerr3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
519*4882a593Smuzhiyunerr3;	lwz	r7,4(r4)
520*4882a593Smuzhiyun	addi	r4,r4,8
521*4882a593Smuzhiyunerr3;	stw	r0,0(r3)
522*4882a593Smuzhiyunerr3;	stw	r7,4(r3)
523*4882a593Smuzhiyun	addi	r3,r3,8
524*4882a593Smuzhiyun
525*4882a593Smuzhiyun4:	sub	r5,r5,r6
526*4882a593Smuzhiyun
527*4882a593Smuzhiyun	/* Get the destination 128B aligned */
	/*
	 * cr7 = low 4 bits of (-destination >> 4), selecting 16, 32 and
	 * 64 byte steps. r6 = -destination modulo 128, the bytes they copy.
	 * r9, r10, r11 are index registers for lvx and stvx.
	 */
528*4882a593Smuzhiyun	neg	r6,r3
529*4882a593Smuzhiyun	srdi	r7,r6,4
530*4882a593Smuzhiyun	mtocrf	0x01,r7
531*4882a593Smuzhiyun	clrldi	r6,r6,(64-7)
532*4882a593Smuzhiyun
533*4882a593Smuzhiyun	li	r9,16
534*4882a593Smuzhiyun	li	r10,32
535*4882a593Smuzhiyun	li	r11,48
536*4882a593Smuzhiyun
537*4882a593Smuzhiyun	LVS(v16,0,r4)		/* Setup permute control vector */
	/* Prime v0 with the first source vector, r4 now leads by 16 bytes */
538*4882a593Smuzhiyunerr3;	lvx	v0,0,r4
539*4882a593Smuzhiyun	addi	r4,r4,16
540*4882a593Smuzhiyun
541*4882a593Smuzhiyun	bf	cr7*4+3,5f
542*4882a593Smuzhiyunerr3;	lvx	v1,0,r4
543*4882a593Smuzhiyun	VPERM(v8,v0,v1,v16)
544*4882a593Smuzhiyun	addi	r4,r4,16
545*4882a593Smuzhiyunerr3;	stvx	v8,0,r3
546*4882a593Smuzhiyun	addi	r3,r3,16
	/* v0 = v1, so v0 again holds the last vector loaded */
547*4882a593Smuzhiyun	vor	v0,v1,v1
548*4882a593Smuzhiyun
549*4882a593Smuzhiyun5:	bf	cr7*4+2,6f
550*4882a593Smuzhiyunerr3;	lvx	v1,0,r4
551*4882a593Smuzhiyun	VPERM(v8,v0,v1,v16)
552*4882a593Smuzhiyunerr3;	lvx	v0,r4,r9
553*4882a593Smuzhiyun	VPERM(v9,v1,v0,v16)
554*4882a593Smuzhiyun	addi	r4,r4,32
555*4882a593Smuzhiyunerr3;	stvx	v8,0,r3
556*4882a593Smuzhiyunerr3;	stvx	v9,r3,r9
557*4882a593Smuzhiyun	addi	r3,r3,32
558*4882a593Smuzhiyun
559*4882a593Smuzhiyun6:	bf	cr7*4+1,7f
560*4882a593Smuzhiyunerr3;	lvx	v3,0,r4
561*4882a593Smuzhiyun	VPERM(v8,v0,v3,v16)
562*4882a593Smuzhiyunerr3;	lvx	v2,r4,r9
563*4882a593Smuzhiyun	VPERM(v9,v3,v2,v16)
564*4882a593Smuzhiyunerr3;	lvx	v1,r4,r10
565*4882a593Smuzhiyun	VPERM(v10,v2,v1,v16)
566*4882a593Smuzhiyunerr3;	lvx	v0,r4,r11
567*4882a593Smuzhiyun	VPERM(v11,v1,v0,v16)
568*4882a593Smuzhiyun	addi	r4,r4,64
569*4882a593Smuzhiyunerr3;	stvx	v8,0,r3
570*4882a593Smuzhiyunerr3;	stvx	v9,r3,r9
571*4882a593Smuzhiyunerr3;	stvx	v10,r3,r10
572*4882a593Smuzhiyunerr3;	stvx	v11,r3,r11
573*4882a593Smuzhiyun	addi	r3,r3,64
574*4882a593Smuzhiyun
	/* r5 = bytes left, r6 = number of whole 128B blocks (goes to CTR) */
575*4882a593Smuzhiyun7:	sub	r5,r5,r6
576*4882a593Smuzhiyun	srdi	r6,r5,7
577*4882a593Smuzhiyun
	/*
	 * r14-r16 are saved to the frame and reused as index registers.
	 * Accesses from here to the end of the loop are tagged err4 so the
	 * fixup reloads them.
	 */
578*4882a593Smuzhiyun	std	r14,STK_REG(R14)(r1)
579*4882a593Smuzhiyun	std	r15,STK_REG(R15)(r1)
580*4882a593Smuzhiyun	std	r16,STK_REG(R16)(r1)
581*4882a593Smuzhiyun
582*4882a593Smuzhiyun	li	r12,64
583*4882a593Smuzhiyun	li	r14,80
584*4882a593Smuzhiyun	li	r15,96
585*4882a593Smuzhiyun	li	r16,112
586*4882a593Smuzhiyun
587*4882a593Smuzhiyun	mtctr	r6
588*4882a593Smuzhiyun
589*4882a593Smuzhiyun	/*
590*4882a593Smuzhiyun	 * Now do cacheline sized loads and stores. By this stage the
591*4882a593Smuzhiyun	 * cacheline stores are also cacheline aligned.
592*4882a593Smuzhiyun	 */
	/*
	 * Eight loads interleaved with eight permutes into v8-v15, then
	 * eight stores. The last load lands in v0, ready for the next
	 * iteration or for the tail code.
	 */
593*4882a593Smuzhiyun	.align	5
594*4882a593Smuzhiyun8:
595*4882a593Smuzhiyunerr4;	lvx	v7,0,r4
596*4882a593Smuzhiyun	VPERM(v8,v0,v7,v16)
597*4882a593Smuzhiyunerr4;	lvx	v6,r4,r9
598*4882a593Smuzhiyun	VPERM(v9,v7,v6,v16)
599*4882a593Smuzhiyunerr4;	lvx	v5,r4,r10
600*4882a593Smuzhiyun	VPERM(v10,v6,v5,v16)
601*4882a593Smuzhiyunerr4;	lvx	v4,r4,r11
602*4882a593Smuzhiyun	VPERM(v11,v5,v4,v16)
603*4882a593Smuzhiyunerr4;	lvx	v3,r4,r12
604*4882a593Smuzhiyun	VPERM(v12,v4,v3,v16)
605*4882a593Smuzhiyunerr4;	lvx	v2,r4,r14
606*4882a593Smuzhiyun	VPERM(v13,v3,v2,v16)
607*4882a593Smuzhiyunerr4;	lvx	v1,r4,r15
608*4882a593Smuzhiyun	VPERM(v14,v2,v1,v16)
609*4882a593Smuzhiyunerr4;	lvx	v0,r4,r16
610*4882a593Smuzhiyun	VPERM(v15,v1,v0,v16)
611*4882a593Smuzhiyun	addi	r4,r4,128
612*4882a593Smuzhiyunerr4;	stvx	v8,0,r3
613*4882a593Smuzhiyunerr4;	stvx	v9,r3,r9
614*4882a593Smuzhiyunerr4;	stvx	v10,r3,r10
615*4882a593Smuzhiyunerr4;	stvx	v11,r3,r11
616*4882a593Smuzhiyunerr4;	stvx	v12,r3,r12
617*4882a593Smuzhiyunerr4;	stvx	v13,r3,r14
618*4882a593Smuzhiyunerr4;	stvx	v14,r3,r15
619*4882a593Smuzhiyunerr4;	stvx	v15,r3,r16
620*4882a593Smuzhiyun	addi	r3,r3,128
621*4882a593Smuzhiyun	bdnz	8b
622*4882a593Smuzhiyun
	/* r14-r16 restored, so the remaining accesses go back to err3 */
623*4882a593Smuzhiyun	ld	r14,STK_REG(R14)(r1)
624*4882a593Smuzhiyun	ld	r15,STK_REG(R15)(r1)
625*4882a593Smuzhiyun	ld	r16,STK_REG(R16)(r1)
626*4882a593Smuzhiyun
627*4882a593Smuzhiyun	/* Up to 127B to go */
	/* cr7 = low 4 bits of (r5 >> 4): 64, 32 and 16 byte steps */
628*4882a593Smuzhiyun	clrldi	r5,r5,(64-7)
629*4882a593Smuzhiyun	srdi	r6,r5,4
630*4882a593Smuzhiyun	mtocrf	0x01,r6
631*4882a593Smuzhiyun
632*4882a593Smuzhiyun	bf	cr7*4+1,9f
633*4882a593Smuzhiyunerr3;	lvx	v3,0,r4
634*4882a593Smuzhiyun	VPERM(v8,v0,v3,v16)
635*4882a593Smuzhiyunerr3;	lvx	v2,r4,r9
636*4882a593Smuzhiyun	VPERM(v9,v3,v2,v16)
637*4882a593Smuzhiyunerr3;	lvx	v1,r4,r10
638*4882a593Smuzhiyun	VPERM(v10,v2,v1,v16)
639*4882a593Smuzhiyunerr3;	lvx	v0,r4,r11
640*4882a593Smuzhiyun	VPERM(v11,v1,v0,v16)
641*4882a593Smuzhiyun	addi	r4,r4,64
642*4882a593Smuzhiyunerr3;	stvx	v8,0,r3
643*4882a593Smuzhiyunerr3;	stvx	v9,r3,r9
644*4882a593Smuzhiyunerr3;	stvx	v10,r3,r10
645*4882a593Smuzhiyunerr3;	stvx	v11,r3,r11
646*4882a593Smuzhiyun	addi	r3,r3,64
647*4882a593Smuzhiyun
648*4882a593Smuzhiyun9:	bf	cr7*4+2,10f
649*4882a593Smuzhiyunerr3;	lvx	v1,0,r4
650*4882a593Smuzhiyun	VPERM(v8,v0,v1,v16)
651*4882a593Smuzhiyunerr3;	lvx	v0,r4,r9
652*4882a593Smuzhiyun	VPERM(v9,v1,v0,v16)
653*4882a593Smuzhiyun	addi	r4,r4,32
654*4882a593Smuzhiyunerr3;	stvx	v8,0,r3
655*4882a593Smuzhiyunerr3;	stvx	v9,r3,r9
656*4882a593Smuzhiyun	addi	r3,r3,32
657*4882a593Smuzhiyun
658*4882a593Smuzhiyun10:	bf	cr7*4+3,11f
659*4882a593Smuzhiyunerr3;	lvx	v1,0,r4
660*4882a593Smuzhiyun	VPERM(v8,v0,v1,v16)
661*4882a593Smuzhiyun	addi	r4,r4,16
662*4882a593Smuzhiyunerr3;	stvx	v8,0,r3
663*4882a593Smuzhiyun	addi	r3,r3,16
664*4882a593Smuzhiyun
665*4882a593Smuzhiyun	/* Up to 15B to go */
	/* cr7 = low 4 bits of r5: 8, 4, 2 and 1 byte steps, scalar again */
666*4882a593Smuzhiyun11:	clrldi	r5,r5,(64-4)
667*4882a593Smuzhiyun	addi	r4,r4,-16	/* Unwind the +16 load offset */
668*4882a593Smuzhiyun	mtocrf	0x01,r5
669*4882a593Smuzhiyun	bf	cr7*4+0,12f
670*4882a593Smuzhiyunerr3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
671*4882a593Smuzhiyunerr3;	lwz	r6,4(r4)
672*4882a593Smuzhiyun	addi	r4,r4,8
673*4882a593Smuzhiyunerr3;	stw	r0,0(r3)
674*4882a593Smuzhiyunerr3;	stw	r6,4(r3)
675*4882a593Smuzhiyun	addi	r3,r3,8
676*4882a593Smuzhiyun
677*4882a593Smuzhiyun12:	bf	cr7*4+1,13f
678*4882a593Smuzhiyunerr3;	lwz	r0,0(r4)
679*4882a593Smuzhiyun	addi	r4,r4,4
680*4882a593Smuzhiyunerr3;	stw	r0,0(r3)
681*4882a593Smuzhiyun	addi	r3,r3,4
682*4882a593Smuzhiyun
683*4882a593Smuzhiyun13:	bf	cr7*4+2,14f
684*4882a593Smuzhiyunerr3;	lhz	r0,0(r4)
685*4882a593Smuzhiyun	addi	r4,r4,2
686*4882a593Smuzhiyunerr3;	sth	r0,0(r3)
687*4882a593Smuzhiyun	addi	r3,r3,2
688*4882a593Smuzhiyun
689*4882a593Smuzhiyun14:	bf	cr7*4+3,15f
690*4882a593Smuzhiyunerr3;	lbz	r0,0(r4)
691*4882a593Smuzhiyunerr3;	stb	r0,0(r3)
692*4882a593Smuzhiyun
	/*
	 * Release the frame and finish in exit_vmx_usercopy. LR already
	 * holds the caller's return address, so that routine returns
	 * straight to the caller. The value left in r3 is up to
	 * exit_vmx_usercopy (presumably 0, confirm there).
	 */
693*4882a593Smuzhiyun15:	addi	r1,r1,STACKFRAMESIZE
694*4882a593Smuzhiyun	b	exit_vmx_usercopy	/* tail call optimise */
695*4882a593Smuzhiyun#endif /* CONFIG_ALTIVEC */
696