xref: /OK3568_Linux_fs/kernel/arch/powerpc/lib/memcpy_power7.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Copyright (C) IBM Corporation, 2012
5*4882a593Smuzhiyun *
6*4882a593Smuzhiyun * Author: Anton Blanchard <anton@au.ibm.com>
7*4882a593Smuzhiyun */
8*4882a593Smuzhiyun#include <asm/ppc_asm.h>
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun#ifndef SELFTEST_CASE
11*4882a593Smuzhiyun/* 0 == don't use VMX, 1 == use VMX */
12*4882a593Smuzhiyun#define SELFTEST_CASE	0
13*4882a593Smuzhiyun#endif
14*4882a593Smuzhiyun
15*4882a593Smuzhiyun#ifdef __BIG_ENDIAN__
16*4882a593Smuzhiyun#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
17*4882a593Smuzhiyun#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
18*4882a593Smuzhiyun#else
19*4882a593Smuzhiyun#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
20*4882a593Smuzhiyun#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
21*4882a593Smuzhiyun#endif
22*4882a593Smuzhiyun
/*
 * memcpy_power7 entry point.
 *
 * In:  r3 = destination, r4 = source, r5 = length in bytes.
 * Out: r3 = the incoming r3 (it is stashed below the stack pointer here and
 *      reloaded on every return path in this file).
 *
 * Dispatch on length:
 *   len < 16    -> .Lshort_copy (the final <16B tail of the scalar path)
 *   len > 4096  -> .Lvmx_copy, but only via the branch inside the
 *                  CPU_FTR_ALTIVEC feature section below
 *   otherwise   -> fall through to .Lnonvmx_copy
 */
23*4882a593Smuzhiyun_GLOBAL(memcpy_power7)
24*4882a593Smuzhiyun	cmpldi	r5,16	/* cr0: len vs 16 (unsigned) */
25*4882a593Smuzhiyun	cmpldi	cr1,r5,4096	/* cr1: len vs 4096 (unsigned), used by the bgt below */
26*4882a593Smuzhiyun	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* stash original dest for the return value */
27*4882a593Smuzhiyun	blt	.Lshort_copy	/* fewer than 16 bytes: straight to the tail code */
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun#ifdef CONFIG_ALTIVEC
/*
 * test_feature takes its value from SELFTEST_CASE (0 == don't use VMX,
 * 1 == use VMX, see the define at the top of the file).
 * NOTE(review): BEGIN_FTR_SECTION/END_FTR_SECTION_IFSET are defined outside
 * this file; presumably the branch is only live when CPU_FTR_ALTIVEC is set.
 */
30*4882a593Smuzhiyuntest_feature = SELFTEST_CASE
31*4882a593SmuzhiyunBEGIN_FTR_SECTION
32*4882a593Smuzhiyun	bgt	cr1, .Lvmx_copy	/* len > 4096: use the vector copy */
33*4882a593SmuzhiyunEND_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
34*4882a593Smuzhiyun#endif
35*4882a593Smuzhiyun
/*
 * Scalar (non-VMX) copy.
 *
 * Register roles throughout this path:
 *   r3 = destination cursor, r4 = source cursor, r5 = bytes remaining,
 *   r6 = scratch (alignment byte count, then loop count),
 *   cr7 = four flag bits loaded with mtocrf; each "bf cr7*4+N" skips the
 *         copy step for one power-of-two sized chunk when its bit is clear.
 *
 * Reached by falling through from the entry code, or from
 * .Lunwind_stack_nonvmx_copy. The entry code branches directly to
 * .Lshort_copy when fewer than 16 bytes are requested.
 */
36*4882a593Smuzhiyun.Lnonvmx_copy:
37*4882a593Smuzhiyun	/* Get the source 8B aligned */
38*4882a593Smuzhiyun	neg	r6,r4
39*4882a593Smuzhiyun	mtocrf	0x01,r6	/* cr7 = low 4 bits of -src */
40*4882a593Smuzhiyun	clrldi	r6,r6,(64-3)	/* r6 = (-src) & 7 = bytes needed to align the source */
41*4882a593Smuzhiyun
42*4882a593Smuzhiyun	bf	cr7*4+3,1f	/* bit value 1: copy one byte */
43*4882a593Smuzhiyun	lbz	r0,0(r4)
44*4882a593Smuzhiyun	addi	r4,r4,1
45*4882a593Smuzhiyun	stb	r0,0(r3)
46*4882a593Smuzhiyun	addi	r3,r3,1
47*4882a593Smuzhiyun
48*4882a593Smuzhiyun1:	bf	cr7*4+2,2f	/* bit value 2: copy a halfword */
49*4882a593Smuzhiyun	lhz	r0,0(r4)
50*4882a593Smuzhiyun	addi	r4,r4,2
51*4882a593Smuzhiyun	sth	r0,0(r3)
52*4882a593Smuzhiyun	addi	r3,r3,2
53*4882a593Smuzhiyun
54*4882a593Smuzhiyun2:	bf	cr7*4+1,3f	/* bit value 4: copy a word */
55*4882a593Smuzhiyun	lwz	r0,0(r4)
56*4882a593Smuzhiyun	addi	r4,r4,4
57*4882a593Smuzhiyun	stw	r0,0(r3)
58*4882a593Smuzhiyun	addi	r3,r3,4
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun3:	sub	r5,r5,r6	/* account for the alignment bytes just copied */
61*4882a593Smuzhiyun	cmpldi	r5,128
62*4882a593Smuzhiyun	blt	5f	/* less than one 128B block left: skip the loop and its frame */
63*4882a593Smuzhiyun
	/*
	 * Open a stack frame and save r14-r22 so they can be used as copy
	 * registers. Note: r22 is saved/restored, but the loop below only
	 * uses r14-r21. LR is stored at offset 16 above the new frame and is
	 * not reloaded on this path (nothing here modifies LR).
	 */
64*4882a593Smuzhiyun	mflr	r0
65*4882a593Smuzhiyun	stdu	r1,-STACKFRAMESIZE(r1)
66*4882a593Smuzhiyun	std	r14,STK_REG(R14)(r1)
67*4882a593Smuzhiyun	std	r15,STK_REG(R15)(r1)
68*4882a593Smuzhiyun	std	r16,STK_REG(R16)(r1)
69*4882a593Smuzhiyun	std	r17,STK_REG(R17)(r1)
70*4882a593Smuzhiyun	std	r18,STK_REG(R18)(r1)
71*4882a593Smuzhiyun	std	r19,STK_REG(R19)(r1)
72*4882a593Smuzhiyun	std	r20,STK_REG(R20)(r1)
73*4882a593Smuzhiyun	std	r21,STK_REG(R21)(r1)
74*4882a593Smuzhiyun	std	r22,STK_REG(R22)(r1)
75*4882a593Smuzhiyun	std	r0,STACKFRAMESIZE+16(r1)
76*4882a593Smuzhiyun
77*4882a593Smuzhiyun	srdi	r6,r5,7	/* r6 = number of whole 128B blocks (>= 1 because of the blt above) */
78*4882a593Smuzhiyun	mtctr	r6
79*4882a593Smuzhiyun
80*4882a593Smuzhiyun	/* Now do cacheline (128B) sized loads and stores. */
81*4882a593Smuzhiyun	.align	5
82*4882a593Smuzhiyun4:
	/* All sixteen doubleword loads are issued before the first store. */
83*4882a593Smuzhiyun	ld	r0,0(r4)
84*4882a593Smuzhiyun	ld	r6,8(r4)
85*4882a593Smuzhiyun	ld	r7,16(r4)
86*4882a593Smuzhiyun	ld	r8,24(r4)
87*4882a593Smuzhiyun	ld	r9,32(r4)
88*4882a593Smuzhiyun	ld	r10,40(r4)
89*4882a593Smuzhiyun	ld	r11,48(r4)
90*4882a593Smuzhiyun	ld	r12,56(r4)
91*4882a593Smuzhiyun	ld	r14,64(r4)
92*4882a593Smuzhiyun	ld	r15,72(r4)
93*4882a593Smuzhiyun	ld	r16,80(r4)
94*4882a593Smuzhiyun	ld	r17,88(r4)
95*4882a593Smuzhiyun	ld	r18,96(r4)
96*4882a593Smuzhiyun	ld	r19,104(r4)
97*4882a593Smuzhiyun	ld	r20,112(r4)
98*4882a593Smuzhiyun	ld	r21,120(r4)
99*4882a593Smuzhiyun	addi	r4,r4,128
100*4882a593Smuzhiyun	std	r0,0(r3)
101*4882a593Smuzhiyun	std	r6,8(r3)
102*4882a593Smuzhiyun	std	r7,16(r3)
103*4882a593Smuzhiyun	std	r8,24(r3)
104*4882a593Smuzhiyun	std	r9,32(r3)
105*4882a593Smuzhiyun	std	r10,40(r3)
106*4882a593Smuzhiyun	std	r11,48(r3)
107*4882a593Smuzhiyun	std	r12,56(r3)
108*4882a593Smuzhiyun	std	r14,64(r3)
109*4882a593Smuzhiyun	std	r15,72(r3)
110*4882a593Smuzhiyun	std	r16,80(r3)
111*4882a593Smuzhiyun	std	r17,88(r3)
112*4882a593Smuzhiyun	std	r18,96(r3)
113*4882a593Smuzhiyun	std	r19,104(r3)
114*4882a593Smuzhiyun	std	r20,112(r3)
115*4882a593Smuzhiyun	std	r21,120(r3)
116*4882a593Smuzhiyun	addi	r3,r3,128
117*4882a593Smuzhiyun	bdnz	4b
118*4882a593Smuzhiyun
119*4882a593Smuzhiyun	clrldi	r5,r5,(64-7)	/* r5 = len & 127: what the block loop did not cover */
120*4882a593Smuzhiyun
	/* Restore the saved registers and pop the frame. */
121*4882a593Smuzhiyun	ld	r14,STK_REG(R14)(r1)
122*4882a593Smuzhiyun	ld	r15,STK_REG(R15)(r1)
123*4882a593Smuzhiyun	ld	r16,STK_REG(R16)(r1)
124*4882a593Smuzhiyun	ld	r17,STK_REG(R17)(r1)
125*4882a593Smuzhiyun	ld	r18,STK_REG(R18)(r1)
126*4882a593Smuzhiyun	ld	r19,STK_REG(R19)(r1)
127*4882a593Smuzhiyun	ld	r20,STK_REG(R20)(r1)
128*4882a593Smuzhiyun	ld	r21,STK_REG(R21)(r1)
129*4882a593Smuzhiyun	ld	r22,STK_REG(R22)(r1)
130*4882a593Smuzhiyun	addi	r1,r1,STACKFRAMESIZE
131*4882a593Smuzhiyun
132*4882a593Smuzhiyun	/* Up to 127B to go */
133*4882a593Smuzhiyun5:	srdi	r6,r5,4
134*4882a593Smuzhiyun	mtocrf	0x01,r6	/* cr7 = bits 4-7 of len: flags for 16/32/64 byte chunks */
135*4882a593Smuzhiyun
136*4882a593Smuzhiyun6:	bf	cr7*4+1,7f	/* 64-byte chunk */
137*4882a593Smuzhiyun	ld	r0,0(r4)
138*4882a593Smuzhiyun	ld	r6,8(r4)
139*4882a593Smuzhiyun	ld	r7,16(r4)
140*4882a593Smuzhiyun	ld	r8,24(r4)
141*4882a593Smuzhiyun	ld	r9,32(r4)
142*4882a593Smuzhiyun	ld	r10,40(r4)
143*4882a593Smuzhiyun	ld	r11,48(r4)
144*4882a593Smuzhiyun	ld	r12,56(r4)
145*4882a593Smuzhiyun	addi	r4,r4,64
146*4882a593Smuzhiyun	std	r0,0(r3)
147*4882a593Smuzhiyun	std	r6,8(r3)
148*4882a593Smuzhiyun	std	r7,16(r3)
149*4882a593Smuzhiyun	std	r8,24(r3)
150*4882a593Smuzhiyun	std	r9,32(r3)
151*4882a593Smuzhiyun	std	r10,40(r3)
152*4882a593Smuzhiyun	std	r11,48(r3)
153*4882a593Smuzhiyun	std	r12,56(r3)
154*4882a593Smuzhiyun	addi	r3,r3,64
155*4882a593Smuzhiyun
156*4882a593Smuzhiyun	/* Up to 63B to go */
157*4882a593Smuzhiyun7:	bf	cr7*4+2,8f	/* 32-byte chunk */
158*4882a593Smuzhiyun	ld	r0,0(r4)
159*4882a593Smuzhiyun	ld	r6,8(r4)
160*4882a593Smuzhiyun	ld	r7,16(r4)
161*4882a593Smuzhiyun	ld	r8,24(r4)
162*4882a593Smuzhiyun	addi	r4,r4,32
163*4882a593Smuzhiyun	std	r0,0(r3)
164*4882a593Smuzhiyun	std	r6,8(r3)
165*4882a593Smuzhiyun	std	r7,16(r3)
166*4882a593Smuzhiyun	std	r8,24(r3)
167*4882a593Smuzhiyun	addi	r3,r3,32
168*4882a593Smuzhiyun
169*4882a593Smuzhiyun	/* Up to 31B to go */
170*4882a593Smuzhiyun8:	bf	cr7*4+3,9f	/* 16-byte chunk */
171*4882a593Smuzhiyun	ld	r0,0(r4)
172*4882a593Smuzhiyun	ld	r6,8(r4)
173*4882a593Smuzhiyun	addi	r4,r4,16
174*4882a593Smuzhiyun	std	r0,0(r3)
175*4882a593Smuzhiyun	std	r6,8(r3)
176*4882a593Smuzhiyun	addi	r3,r3,16
177*4882a593Smuzhiyun
178*4882a593Smuzhiyun9:	clrldi	r5,r5,(64-4)	/* r5 = len & 15 */
179*4882a593Smuzhiyun
180*4882a593Smuzhiyun	/* Up to 15B to go */
181*4882a593Smuzhiyun.Lshort_copy:
182*4882a593Smuzhiyun	mtocrf	0x01,r5	/* cr7 = low 4 bits of len: flags for 8/4/2/1 byte steps */
183*4882a593Smuzhiyun	bf	cr7*4+0,12f	/* 8 bytes, done as two word copies */
184*4882a593Smuzhiyun	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
185*4882a593Smuzhiyun	lwz	r6,4(r4)
186*4882a593Smuzhiyun	addi	r4,r4,8
187*4882a593Smuzhiyun	stw	r0,0(r3)
188*4882a593Smuzhiyun	stw	r6,4(r3)
189*4882a593Smuzhiyun	addi	r3,r3,8
190*4882a593Smuzhiyun
191*4882a593Smuzhiyun12:	bf	cr7*4+1,13f	/* 4 bytes */
192*4882a593Smuzhiyun	lwz	r0,0(r4)
193*4882a593Smuzhiyun	addi	r4,r4,4
194*4882a593Smuzhiyun	stw	r0,0(r3)
195*4882a593Smuzhiyun	addi	r3,r3,4
196*4882a593Smuzhiyun
197*4882a593Smuzhiyun13:	bf	cr7*4+2,14f	/* 2 bytes */
198*4882a593Smuzhiyun	lhz	r0,0(r4)
199*4882a593Smuzhiyun	addi	r4,r4,2
200*4882a593Smuzhiyun	sth	r0,0(r3)
201*4882a593Smuzhiyun	addi	r3,r3,2
202*4882a593Smuzhiyun
203*4882a593Smuzhiyun14:	bf	cr7*4+3,15f	/* last byte; cursors are not advanced, nothing follows */
204*4882a593Smuzhiyun	lbz	r0,0(r4)
205*4882a593Smuzhiyun	stb	r0,0(r3)
206*4882a593Smuzhiyun
207*4882a593Smuzhiyun15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return the original destination pointer */
208*4882a593Smuzhiyun	blr
209*4882a593Smuzhiyun
/*
 * Fallback taken from .Lvmx_copy when enter_vmx_ops returned 0: drop the
 * stack frame that .Lvmx_copy pushed and redo the copy with the scalar path.
 * r3/r4/r5 and LR have already been reloaded by .Lvmx_copy at this point.
 */
210*4882a593Smuzhiyun.Lunwind_stack_nonvmx_copy:
211*4882a593Smuzhiyun	addi	r1,r1,STACKFRAMESIZE
212*4882a593Smuzhiyun	b	.Lnonvmx_copy
213*4882a593Smuzhiyun
/*
 * VMX (vector) copy, entered from the entry code when len > 4096.
 *
 * Sequence:
 *   1. push a frame and call enter_vmx_ops, keeping r4/r5/LR across the call
 *   2. start prefetch streams for the source and destination
 *   3. if enter_vmx_ops returned 0, fall back to the scalar path
 *   4. if (src ^ dst) & 15 is non-zero, go to .Lvmx_unaligned_copy
 *   5. otherwise: align dest to 16B, then to 128B, copy 128B per loop
 *      iteration with lvx/stvx, then finish the <128B tail
 *
 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining,
 * r9/r10/r11/r12/r14/r15/r16 = constant index offsets 16..112 for lvx/stvx.
 */
214*4882a593Smuzhiyun.Lvmx_copy:
215*4882a593Smuzhiyun#ifdef CONFIG_ALTIVEC
216*4882a593Smuzhiyun	mflr	r0
	/*
	 * r4/r5 are written below the current r1; after the stdu below these
	 * same slots are addressed as STK_REG(R30)/STK_REG(R29) of the new
	 * frame (r3 was stored in the R31 slot by the entry code).
	 */
217*4882a593Smuzhiyun	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
218*4882a593Smuzhiyun	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
219*4882a593Smuzhiyun	std	r0,16(r1)
220*4882a593Smuzhiyun	stdu	r1,-STACKFRAMESIZE(r1)
221*4882a593Smuzhiyun	bl	enter_vmx_ops
222*4882a593Smuzhiyun	cmpwi	cr1,r3,0	/* cr1 remembers whether enter_vmx_ops returned 0; tested after the prefetch setup */
223*4882a593Smuzhiyun	ld	r0,STACKFRAMESIZE+16(r1)
224*4882a593Smuzhiyun	ld	r3,STK_REG(R31)(r1)
225*4882a593Smuzhiyun	ld	r4,STK_REG(R30)(r1)
226*4882a593Smuzhiyun	ld	r5,STK_REG(R29)(r1)
227*4882a593Smuzhiyun	mtlr	r0
228*4882a593Smuzhiyun
229*4882a593Smuzhiyun	/*
230*4882a593Smuzhiyun	 * We prefetch both the source and destination using enhanced touch
231*4882a593Smuzhiyun	 * instructions. We use a stream ID of 0 for the load side and
232*4882a593Smuzhiyun	 * 1 for the store side.
	 *
	 * The stream is set up before cr1 is tested, so it is also started
	 * when the code below falls back to the non-VMX copy.
233*4882a593Smuzhiyun	 */
234*4882a593Smuzhiyun	clrrdi	r6,r4,7	/* r6 = source rounded down to 128B */
235*4882a593Smuzhiyun	clrrdi	r9,r3,7	/* r9 = destination rounded down to 128B */
236*4882a593Smuzhiyun	ori	r9,r9,1		/* stream=1 */
237*4882a593Smuzhiyun
238*4882a593Smuzhiyun	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
239*4882a593Smuzhiyun	cmpldi	r7,0x3FF
240*4882a593Smuzhiyun	ble	1f
241*4882a593Smuzhiyun	li	r7,0x3FF
242*4882a593Smuzhiyun1:	lis	r0,0x0E00	/* depth=7 */
243*4882a593Smuzhiyun	sldi	r7,r7,7	/* place the cacheline count above the low 7 bits */
244*4882a593Smuzhiyun	or	r7,r7,r0	/* r7 = length/depth word for stream 0 */
245*4882a593Smuzhiyun	ori	r10,r7,1	/* stream=1 */
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun	lis	r8,0x8000	/* GO=1 */
248*4882a593Smuzhiyun	clrldi	r8,r8,32	/* clear the upper 32 bits set by the sign-extending lis */
249*4882a593Smuzhiyun
250*4882a593Smuzhiyun	dcbt	0,r6,0b01000
251*4882a593Smuzhiyun	dcbt	0,r7,0b01010
252*4882a593Smuzhiyun	dcbtst	0,r9,0b01000
253*4882a593Smuzhiyun	dcbtst	0,r10,0b01010
254*4882a593Smuzhiyun	eieio
255*4882a593Smuzhiyun	dcbt	0,r8,0b01010	/* GO */
256*4882a593Smuzhiyun
257*4882a593Smuzhiyun	beq	cr1,.Lunwind_stack_nonvmx_copy	/* enter_vmx_ops returned 0: use the scalar copy */
258*4882a593Smuzhiyun
259*4882a593Smuzhiyun	/*
260*4882a593Smuzhiyun	 * If source and destination are not relatively aligned we use a
261*4882a593Smuzhiyun	 * slower permute loop.
262*4882a593Smuzhiyun	 */
263*4882a593Smuzhiyun	xor	r6,r4,r3
264*4882a593Smuzhiyun	rldicl.	r6,r6,0,(64-4)	/* (src ^ dst) & 15, setting cr0 */
265*4882a593Smuzhiyun	bne	.Lvmx_unaligned_copy
266*4882a593Smuzhiyun
267*4882a593Smuzhiyun	/* Get the destination 16B aligned */
268*4882a593Smuzhiyun	neg	r6,r3
269*4882a593Smuzhiyun	mtocrf	0x01,r6	/* cr7 = low 4 bits of -dst: flags for 1/2/4/8 byte steps */
270*4882a593Smuzhiyun	clrldi	r6,r6,(64-4)	/* r6 = (-dst) & 15 = bytes needed to align the destination */
271*4882a593Smuzhiyun
272*4882a593Smuzhiyun	bf	cr7*4+3,1f	/* 1 byte */
273*4882a593Smuzhiyun	lbz	r0,0(r4)
274*4882a593Smuzhiyun	addi	r4,r4,1
275*4882a593Smuzhiyun	stb	r0,0(r3)
276*4882a593Smuzhiyun	addi	r3,r3,1
277*4882a593Smuzhiyun
278*4882a593Smuzhiyun1:	bf	cr7*4+2,2f	/* 2 bytes */
279*4882a593Smuzhiyun	lhz	r0,0(r4)
280*4882a593Smuzhiyun	addi	r4,r4,2
281*4882a593Smuzhiyun	sth	r0,0(r3)
282*4882a593Smuzhiyun	addi	r3,r3,2
283*4882a593Smuzhiyun
284*4882a593Smuzhiyun2:	bf	cr7*4+1,3f	/* 4 bytes */
285*4882a593Smuzhiyun	lwz	r0,0(r4)
286*4882a593Smuzhiyun	addi	r4,r4,4
287*4882a593Smuzhiyun	stw	r0,0(r3)
288*4882a593Smuzhiyun	addi	r3,r3,4
289*4882a593Smuzhiyun
290*4882a593Smuzhiyun3:	bf	cr7*4+0,4f	/* 8 bytes */
291*4882a593Smuzhiyun	ld	r0,0(r4)
292*4882a593Smuzhiyun	addi	r4,r4,8
293*4882a593Smuzhiyun	std	r0,0(r3)
294*4882a593Smuzhiyun	addi	r3,r3,8
295*4882a593Smuzhiyun
296*4882a593Smuzhiyun4:	sub	r5,r5,r6	/* account for the alignment bytes just copied */
297*4882a593Smuzhiyun
298*4882a593Smuzhiyun	/* Get the destination 128B aligned */
299*4882a593Smuzhiyun	neg	r6,r3
300*4882a593Smuzhiyun	srdi	r7,r6,4
301*4882a593Smuzhiyun	mtocrf	0x01,r7	/* cr7 = bits 4-7 of -dst: flags for 16/32/64 byte chunks */
302*4882a593Smuzhiyun	clrldi	r6,r6,(64-7)	/* r6 = (-dst) & 127 = bytes needed to reach a 128B boundary */
303*4882a593Smuzhiyun
	/* Index registers for the indexed vector loads/stores below. */
304*4882a593Smuzhiyun	li	r9,16
305*4882a593Smuzhiyun	li	r10,32
306*4882a593Smuzhiyun	li	r11,48
307*4882a593Smuzhiyun
308*4882a593Smuzhiyun	bf	cr7*4+3,5f	/* 16 bytes */
309*4882a593Smuzhiyun	lvx	v1,0,r4
310*4882a593Smuzhiyun	addi	r4,r4,16
311*4882a593Smuzhiyun	stvx	v1,0,r3
312*4882a593Smuzhiyun	addi	r3,r3,16
313*4882a593Smuzhiyun
314*4882a593Smuzhiyun5:	bf	cr7*4+2,6f	/* 32 bytes */
315*4882a593Smuzhiyun	lvx	v1,0,r4
316*4882a593Smuzhiyun	lvx	v0,r4,r9
317*4882a593Smuzhiyun	addi	r4,r4,32
318*4882a593Smuzhiyun	stvx	v1,0,r3
319*4882a593Smuzhiyun	stvx	v0,r3,r9
320*4882a593Smuzhiyun	addi	r3,r3,32
321*4882a593Smuzhiyun
322*4882a593Smuzhiyun6:	bf	cr7*4+1,7f	/* 64 bytes */
323*4882a593Smuzhiyun	lvx	v3,0,r4
324*4882a593Smuzhiyun	lvx	v2,r4,r9
325*4882a593Smuzhiyun	lvx	v1,r4,r10
326*4882a593Smuzhiyun	lvx	v0,r4,r11
327*4882a593Smuzhiyun	addi	r4,r4,64
328*4882a593Smuzhiyun	stvx	v3,0,r3
329*4882a593Smuzhiyun	stvx	v2,r3,r9
330*4882a593Smuzhiyun	stvx	v1,r3,r10
331*4882a593Smuzhiyun	stvx	v0,r3,r11
332*4882a593Smuzhiyun	addi	r3,r3,64
333*4882a593Smuzhiyun
334*4882a593Smuzhiyun7:	sub	r5,r5,r6	/* account for the bytes copied to reach 128B alignment */
335*4882a593Smuzhiyun	srdi	r6,r5,7	/* r6 = number of whole 128B cachelines left */
336*4882a593Smuzhiyun
	/* r14-r16 are needed as extra index registers; save them in the frame. */
337*4882a593Smuzhiyun	std	r14,STK_REG(R14)(r1)
338*4882a593Smuzhiyun	std	r15,STK_REG(R15)(r1)
339*4882a593Smuzhiyun	std	r16,STK_REG(R16)(r1)
340*4882a593Smuzhiyun
341*4882a593Smuzhiyun	li	r12,64
342*4882a593Smuzhiyun	li	r14,80
343*4882a593Smuzhiyun	li	r15,96
344*4882a593Smuzhiyun	li	r16,112
345*4882a593Smuzhiyun
346*4882a593Smuzhiyun	mtctr	r6
347*4882a593Smuzhiyun
348*4882a593Smuzhiyun	/*
349*4882a593Smuzhiyun	 * Now do cacheline sized loads and stores. By this stage the
350*4882a593Smuzhiyun	 * cacheline stores are also cacheline aligned.
351*4882a593Smuzhiyun	 */
352*4882a593Smuzhiyun	.align	5
353*4882a593Smuzhiyun8:
354*4882a593Smuzhiyun	lvx	v7,0,r4
355*4882a593Smuzhiyun	lvx	v6,r4,r9
356*4882a593Smuzhiyun	lvx	v5,r4,r10
357*4882a593Smuzhiyun	lvx	v4,r4,r11
358*4882a593Smuzhiyun	lvx	v3,r4,r12
359*4882a593Smuzhiyun	lvx	v2,r4,r14
360*4882a593Smuzhiyun	lvx	v1,r4,r15
361*4882a593Smuzhiyun	lvx	v0,r4,r16
362*4882a593Smuzhiyun	addi	r4,r4,128
363*4882a593Smuzhiyun	stvx	v7,0,r3
364*4882a593Smuzhiyun	stvx	v6,r3,r9
365*4882a593Smuzhiyun	stvx	v5,r3,r10
366*4882a593Smuzhiyun	stvx	v4,r3,r11
367*4882a593Smuzhiyun	stvx	v3,r3,r12
368*4882a593Smuzhiyun	stvx	v2,r3,r14
369*4882a593Smuzhiyun	stvx	v1,r3,r15
370*4882a593Smuzhiyun	stvx	v0,r3,r16
371*4882a593Smuzhiyun	addi	r3,r3,128
372*4882a593Smuzhiyun	bdnz	8b
373*4882a593Smuzhiyun
374*4882a593Smuzhiyun	ld	r14,STK_REG(R14)(r1)
375*4882a593Smuzhiyun	ld	r15,STK_REG(R15)(r1)
376*4882a593Smuzhiyun	ld	r16,STK_REG(R16)(r1)
377*4882a593Smuzhiyun
378*4882a593Smuzhiyun	/* Up to 127B to go */
379*4882a593Smuzhiyun	clrldi	r5,r5,(64-7)	/* r5 = len & 127 */
380*4882a593Smuzhiyun	srdi	r6,r5,4
381*4882a593Smuzhiyun	mtocrf	0x01,r6	/* cr7 = bits 4-7 of len: flags for 64/32/16 byte chunks */
382*4882a593Smuzhiyun
383*4882a593Smuzhiyun	bf	cr7*4+1,9f	/* 64 bytes */
384*4882a593Smuzhiyun	lvx	v3,0,r4
385*4882a593Smuzhiyun	lvx	v2,r4,r9
386*4882a593Smuzhiyun	lvx	v1,r4,r10
387*4882a593Smuzhiyun	lvx	v0,r4,r11
388*4882a593Smuzhiyun	addi	r4,r4,64
389*4882a593Smuzhiyun	stvx	v3,0,r3
390*4882a593Smuzhiyun	stvx	v2,r3,r9
391*4882a593Smuzhiyun	stvx	v1,r3,r10
392*4882a593Smuzhiyun	stvx	v0,r3,r11
393*4882a593Smuzhiyun	addi	r3,r3,64
394*4882a593Smuzhiyun
395*4882a593Smuzhiyun9:	bf	cr7*4+2,10f	/* 32 bytes */
396*4882a593Smuzhiyun	lvx	v1,0,r4
397*4882a593Smuzhiyun	lvx	v0,r4,r9
398*4882a593Smuzhiyun	addi	r4,r4,32
399*4882a593Smuzhiyun	stvx	v1,0,r3
400*4882a593Smuzhiyun	stvx	v0,r3,r9
401*4882a593Smuzhiyun	addi	r3,r3,32
402*4882a593Smuzhiyun
403*4882a593Smuzhiyun10:	bf	cr7*4+3,11f	/* 16 bytes */
404*4882a593Smuzhiyun	lvx	v1,0,r4
405*4882a593Smuzhiyun	addi	r4,r4,16
406*4882a593Smuzhiyun	stvx	v1,0,r3
407*4882a593Smuzhiyun	addi	r3,r3,16
408*4882a593Smuzhiyun
409*4882a593Smuzhiyun	/* Up to 15B to go */
410*4882a593Smuzhiyun11:	clrldi	r5,r5,(64-4)	/* r5 = len & 15 */
411*4882a593Smuzhiyun	mtocrf	0x01,r5	/* cr7 = low 4 bits of len: flags for 8/4/2/1 byte steps */
412*4882a593Smuzhiyun	bf	cr7*4+0,12f	/* 8 bytes */
413*4882a593Smuzhiyun	ld	r0,0(r4)
414*4882a593Smuzhiyun	addi	r4,r4,8
415*4882a593Smuzhiyun	std	r0,0(r3)
416*4882a593Smuzhiyun	addi	r3,r3,8
417*4882a593Smuzhiyun
418*4882a593Smuzhiyun12:	bf	cr7*4+1,13f	/* 4 bytes */
419*4882a593Smuzhiyun	lwz	r0,0(r4)
420*4882a593Smuzhiyun	addi	r4,r4,4
421*4882a593Smuzhiyun	stw	r0,0(r3)
422*4882a593Smuzhiyun	addi	r3,r3,4
423*4882a593Smuzhiyun
424*4882a593Smuzhiyun13:	bf	cr7*4+2,14f	/* 2 bytes */
425*4882a593Smuzhiyun	lhz	r0,0(r4)
426*4882a593Smuzhiyun	addi	r4,r4,2
427*4882a593Smuzhiyun	sth	r0,0(r3)
428*4882a593Smuzhiyun	addi	r3,r3,2
429*4882a593Smuzhiyun
430*4882a593Smuzhiyun14:	bf	cr7*4+3,15f	/* last byte */
431*4882a593Smuzhiyun	lbz	r0,0(r4)
432*4882a593Smuzhiyun	stb	r0,0(r3)
433*4882a593Smuzhiyun
	/*
	 * Pop the frame and reload the original destination into r3 before
	 * branching to exit_vmx_ops.
	 * NOTE(review): exit_vmx_ops is defined elsewhere; presumably it
	 * hands r3 back to our caller - confirm.
	 */
434*4882a593Smuzhiyun15:	addi	r1,r1,STACKFRAMESIZE
435*4882a593Smuzhiyun	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
436*4882a593Smuzhiyun	b	exit_vmx_ops		/* tail call optimise */
437*4882a593Smuzhiyun
/*
 * VMX copy for the case where source and destination differ in their low
 * 4 address bits ((src ^ dst) & 15 != 0), reached from .Lvmx_copy with its
 * stack frame still in place.
 *
 * The destination is aligned exactly as in the aligned path, so every stvx
 * is aligned. The source is read with lvx into consecutive vectors, and each
 * output vector is built from two neighbouring source vectors with VPERM
 * using the control vector in v16 (set up once by LVS from the source
 * address).
 *
 * Invariants inside the vector section:
 *   - r4 runs 16 bytes ahead of the logical source position (undone at 11:)
 *   - v0 always holds the most recently loaded source vector and is the
 *     "previous" input of the next VPERM
 */
438*4882a593Smuzhiyun.Lvmx_unaligned_copy:
439*4882a593Smuzhiyun	/* Get the destination 16B aligned */
440*4882a593Smuzhiyun	neg	r6,r3
441*4882a593Smuzhiyun	mtocrf	0x01,r6	/* cr7 = low 4 bits of -dst: flags for 1/2/4/8 byte steps */
442*4882a593Smuzhiyun	clrldi	r6,r6,(64-4)	/* r6 = (-dst) & 15 = bytes needed to align the destination */
443*4882a593Smuzhiyun
444*4882a593Smuzhiyun	bf	cr7*4+3,1f	/* 1 byte */
445*4882a593Smuzhiyun	lbz	r0,0(r4)
446*4882a593Smuzhiyun	addi	r4,r4,1
447*4882a593Smuzhiyun	stb	r0,0(r3)
448*4882a593Smuzhiyun	addi	r3,r3,1
449*4882a593Smuzhiyun
450*4882a593Smuzhiyun1:	bf	cr7*4+2,2f	/* 2 bytes */
451*4882a593Smuzhiyun	lhz	r0,0(r4)
452*4882a593Smuzhiyun	addi	r4,r4,2
453*4882a593Smuzhiyun	sth	r0,0(r3)
454*4882a593Smuzhiyun	addi	r3,r3,2
455*4882a593Smuzhiyun
456*4882a593Smuzhiyun2:	bf	cr7*4+1,3f	/* 4 bytes */
457*4882a593Smuzhiyun	lwz	r0,0(r4)
458*4882a593Smuzhiyun	addi	r4,r4,4
459*4882a593Smuzhiyun	stw	r0,0(r3)
460*4882a593Smuzhiyun	addi	r3,r3,4
461*4882a593Smuzhiyun
462*4882a593Smuzhiyun3:	bf	cr7*4+0,4f	/* 8 bytes, done as two word copies */
463*4882a593Smuzhiyun	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
464*4882a593Smuzhiyun	lwz	r7,4(r4)
465*4882a593Smuzhiyun	addi	r4,r4,8
466*4882a593Smuzhiyun	stw	r0,0(r3)
467*4882a593Smuzhiyun	stw	r7,4(r3)
468*4882a593Smuzhiyun	addi	r3,r3,8
469*4882a593Smuzhiyun
470*4882a593Smuzhiyun4:	sub	r5,r5,r6	/* account for the alignment bytes just copied */
471*4882a593Smuzhiyun
472*4882a593Smuzhiyun	/* Get the destination 128B aligned */
473*4882a593Smuzhiyun	neg	r6,r3
474*4882a593Smuzhiyun	srdi	r7,r6,4
475*4882a593Smuzhiyun	mtocrf	0x01,r7	/* cr7 = bits 4-7 of -dst: flags for 16/32/64 byte chunks */
476*4882a593Smuzhiyun	clrldi	r6,r6,(64-7)	/* r6 = (-dst) & 127 = bytes needed to reach a 128B boundary */
477*4882a593Smuzhiyun
	/* Index registers for the indexed vector loads/stores below. */
478*4882a593Smuzhiyun	li	r9,16
479*4882a593Smuzhiyun	li	r10,32
480*4882a593Smuzhiyun	li	r11,48
481*4882a593Smuzhiyun
482*4882a593Smuzhiyun	LVS(v16,0,r4)		/* Setup permute control vector */
483*4882a593Smuzhiyun	lvx	v0,0,r4	/* prime v0 with the first source vector */
484*4882a593Smuzhiyun	addi	r4,r4,16	/* from here on r4 is 16 bytes ahead (see 11: below) */
485*4882a593Smuzhiyun
486*4882a593Smuzhiyun	bf	cr7*4+3,5f	/* 16 bytes */
487*4882a593Smuzhiyun	lvx	v1,0,r4
488*4882a593Smuzhiyun	VPERM(v8,v0,v1,v16)
489*4882a593Smuzhiyun	addi	r4,r4,16
490*4882a593Smuzhiyun	stvx	v8,0,r3
491*4882a593Smuzhiyun	addi	r3,r3,16
492*4882a593Smuzhiyun	vor	v0,v1,v1	/* v0 = v1: keep the newest source vector in v0 */
493*4882a593Smuzhiyun
	/* In the larger steps the last load targets v0 directly, so no copy is needed. */
494*4882a593Smuzhiyun5:	bf	cr7*4+2,6f	/* 32 bytes */
495*4882a593Smuzhiyun	lvx	v1,0,r4
496*4882a593Smuzhiyun	VPERM(v8,v0,v1,v16)
497*4882a593Smuzhiyun	lvx	v0,r4,r9
498*4882a593Smuzhiyun	VPERM(v9,v1,v0,v16)
499*4882a593Smuzhiyun	addi	r4,r4,32
500*4882a593Smuzhiyun	stvx	v8,0,r3
501*4882a593Smuzhiyun	stvx	v9,r3,r9
502*4882a593Smuzhiyun	addi	r3,r3,32
503*4882a593Smuzhiyun
504*4882a593Smuzhiyun6:	bf	cr7*4+1,7f	/* 64 bytes */
505*4882a593Smuzhiyun	lvx	v3,0,r4
506*4882a593Smuzhiyun	VPERM(v8,v0,v3,v16)
507*4882a593Smuzhiyun	lvx	v2,r4,r9
508*4882a593Smuzhiyun	VPERM(v9,v3,v2,v16)
509*4882a593Smuzhiyun	lvx	v1,r4,r10
510*4882a593Smuzhiyun	VPERM(v10,v2,v1,v16)
511*4882a593Smuzhiyun	lvx	v0,r4,r11
512*4882a593Smuzhiyun	VPERM(v11,v1,v0,v16)
513*4882a593Smuzhiyun	addi	r4,r4,64
514*4882a593Smuzhiyun	stvx	v8,0,r3
515*4882a593Smuzhiyun	stvx	v9,r3,r9
516*4882a593Smuzhiyun	stvx	v10,r3,r10
517*4882a593Smuzhiyun	stvx	v11,r3,r11
518*4882a593Smuzhiyun	addi	r3,r3,64
519*4882a593Smuzhiyun
520*4882a593Smuzhiyun7:	sub	r5,r5,r6	/* account for the bytes copied to reach 128B alignment */
521*4882a593Smuzhiyun	srdi	r6,r5,7	/* r6 = number of whole 128B cachelines left */
522*4882a593Smuzhiyun
	/* r14-r16 are needed as extra index registers; save them in the frame. */
523*4882a593Smuzhiyun	std	r14,STK_REG(R14)(r1)
524*4882a593Smuzhiyun	std	r15,STK_REG(R15)(r1)
525*4882a593Smuzhiyun	std	r16,STK_REG(R16)(r1)
526*4882a593Smuzhiyun
527*4882a593Smuzhiyun	li	r12,64
528*4882a593Smuzhiyun	li	r14,80
529*4882a593Smuzhiyun	li	r15,96
530*4882a593Smuzhiyun	li	r16,112
531*4882a593Smuzhiyun
532*4882a593Smuzhiyun	mtctr	r6
533*4882a593Smuzhiyun
534*4882a593Smuzhiyun	/*
535*4882a593Smuzhiyun	 * Now do cacheline sized loads and stores. By this stage the
536*4882a593Smuzhiyun	 * cacheline stores are also cacheline aligned.
	 *
	 * Each iteration loads eight new source vectors (v7..v0) and builds
	 * eight output vectors (v8..v15); v0 carries over to the next
	 * iteration as the "previous" vector.
537*4882a593Smuzhiyun	 */
538*4882a593Smuzhiyun	.align	5
539*4882a593Smuzhiyun8:
540*4882a593Smuzhiyun	lvx	v7,0,r4
541*4882a593Smuzhiyun	VPERM(v8,v0,v7,v16)
542*4882a593Smuzhiyun	lvx	v6,r4,r9
543*4882a593Smuzhiyun	VPERM(v9,v7,v6,v16)
544*4882a593Smuzhiyun	lvx	v5,r4,r10
545*4882a593Smuzhiyun	VPERM(v10,v6,v5,v16)
546*4882a593Smuzhiyun	lvx	v4,r4,r11
547*4882a593Smuzhiyun	VPERM(v11,v5,v4,v16)
548*4882a593Smuzhiyun	lvx	v3,r4,r12
549*4882a593Smuzhiyun	VPERM(v12,v4,v3,v16)
550*4882a593Smuzhiyun	lvx	v2,r4,r14
551*4882a593Smuzhiyun	VPERM(v13,v3,v2,v16)
552*4882a593Smuzhiyun	lvx	v1,r4,r15
553*4882a593Smuzhiyun	VPERM(v14,v2,v1,v16)
554*4882a593Smuzhiyun	lvx	v0,r4,r16
555*4882a593Smuzhiyun	VPERM(v15,v1,v0,v16)
556*4882a593Smuzhiyun	addi	r4,r4,128
557*4882a593Smuzhiyun	stvx	v8,0,r3
558*4882a593Smuzhiyun	stvx	v9,r3,r9
559*4882a593Smuzhiyun	stvx	v10,r3,r10
560*4882a593Smuzhiyun	stvx	v11,r3,r11
561*4882a593Smuzhiyun	stvx	v12,r3,r12
562*4882a593Smuzhiyun	stvx	v13,r3,r14
563*4882a593Smuzhiyun	stvx	v14,r3,r15
564*4882a593Smuzhiyun	stvx	v15,r3,r16
565*4882a593Smuzhiyun	addi	r3,r3,128
566*4882a593Smuzhiyun	bdnz	8b
567*4882a593Smuzhiyun
568*4882a593Smuzhiyun	ld	r14,STK_REG(R14)(r1)
569*4882a593Smuzhiyun	ld	r15,STK_REG(R15)(r1)
570*4882a593Smuzhiyun	ld	r16,STK_REG(R16)(r1)
571*4882a593Smuzhiyun
572*4882a593Smuzhiyun	/* Up to 127B to go */
573*4882a593Smuzhiyun	clrldi	r5,r5,(64-7)	/* r5 = len & 127 */
574*4882a593Smuzhiyun	srdi	r6,r5,4
575*4882a593Smuzhiyun	mtocrf	0x01,r6	/* cr7 = bits 4-7 of len: flags for 64/32/16 byte chunks */
576*4882a593Smuzhiyun
577*4882a593Smuzhiyun	bf	cr7*4+1,9f	/* 64 bytes */
578*4882a593Smuzhiyun	lvx	v3,0,r4
579*4882a593Smuzhiyun	VPERM(v8,v0,v3,v16)
580*4882a593Smuzhiyun	lvx	v2,r4,r9
581*4882a593Smuzhiyun	VPERM(v9,v3,v2,v16)
582*4882a593Smuzhiyun	lvx	v1,r4,r10
583*4882a593Smuzhiyun	VPERM(v10,v2,v1,v16)
584*4882a593Smuzhiyun	lvx	v0,r4,r11
585*4882a593Smuzhiyun	VPERM(v11,v1,v0,v16)
586*4882a593Smuzhiyun	addi	r4,r4,64
587*4882a593Smuzhiyun	stvx	v8,0,r3
588*4882a593Smuzhiyun	stvx	v9,r3,r9
589*4882a593Smuzhiyun	stvx	v10,r3,r10
590*4882a593Smuzhiyun	stvx	v11,r3,r11
591*4882a593Smuzhiyun	addi	r3,r3,64
592*4882a593Smuzhiyun
593*4882a593Smuzhiyun9:	bf	cr7*4+2,10f	/* 32 bytes */
594*4882a593Smuzhiyun	lvx	v1,0,r4
595*4882a593Smuzhiyun	VPERM(v8,v0,v1,v16)
596*4882a593Smuzhiyun	lvx	v0,r4,r9
597*4882a593Smuzhiyun	VPERM(v9,v1,v0,v16)
598*4882a593Smuzhiyun	addi	r4,r4,32
599*4882a593Smuzhiyun	stvx	v8,0,r3
600*4882a593Smuzhiyun	stvx	v9,r3,r9
601*4882a593Smuzhiyun	addi	r3,r3,32
602*4882a593Smuzhiyun
	/* Last vector step: v0 is not refreshed because no VPERM follows. */
603*4882a593Smuzhiyun10:	bf	cr7*4+3,11f	/* 16 bytes */
604*4882a593Smuzhiyun	lvx	v1,0,r4
605*4882a593Smuzhiyun	VPERM(v8,v0,v1,v16)
606*4882a593Smuzhiyun	addi	r4,r4,16
607*4882a593Smuzhiyun	stvx	v8,0,r3
608*4882a593Smuzhiyun	addi	r3,r3,16
609*4882a593Smuzhiyun
610*4882a593Smuzhiyun	/* Up to 15B to go */
611*4882a593Smuzhiyun11:	clrldi	r5,r5,(64-4)	/* r5 = len & 15 */
612*4882a593Smuzhiyun	addi	r4,r4,-16	/* Unwind the +16 load offset */
613*4882a593Smuzhiyun	mtocrf	0x01,r5	/* cr7 = low 4 bits of len: flags for 8/4/2/1 byte steps */
614*4882a593Smuzhiyun	bf	cr7*4+0,12f	/* 8 bytes, done as two word copies */
615*4882a593Smuzhiyun	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
616*4882a593Smuzhiyun	lwz	r6,4(r4)
617*4882a593Smuzhiyun	addi	r4,r4,8
618*4882a593Smuzhiyun	stw	r0,0(r3)
619*4882a593Smuzhiyun	stw	r6,4(r3)
620*4882a593Smuzhiyun	addi	r3,r3,8
621*4882a593Smuzhiyun
622*4882a593Smuzhiyun12:	bf	cr7*4+1,13f	/* 4 bytes */
623*4882a593Smuzhiyun	lwz	r0,0(r4)
624*4882a593Smuzhiyun	addi	r4,r4,4
625*4882a593Smuzhiyun	stw	r0,0(r3)
626*4882a593Smuzhiyun	addi	r3,r3,4
627*4882a593Smuzhiyun
628*4882a593Smuzhiyun13:	bf	cr7*4+2,14f	/* 2 bytes */
629*4882a593Smuzhiyun	lhz	r0,0(r4)
630*4882a593Smuzhiyun	addi	r4,r4,2
631*4882a593Smuzhiyun	sth	r0,0(r3)
632*4882a593Smuzhiyun	addi	r3,r3,2
633*4882a593Smuzhiyun
634*4882a593Smuzhiyun14:	bf	cr7*4+3,15f	/* last byte */
635*4882a593Smuzhiyun	lbz	r0,0(r4)
636*4882a593Smuzhiyun	stb	r0,0(r3)
637*4882a593Smuzhiyun
	/*
	 * Pop the frame and reload the original destination into r3 before
	 * branching to exit_vmx_ops.
	 * NOTE(review): exit_vmx_ops is defined elsewhere; presumably it
	 * hands r3 back to our caller - confirm.
	 */
638*4882a593Smuzhiyun15:	addi	r1,r1,STACKFRAMESIZE
639*4882a593Smuzhiyun	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
640*4882a593Smuzhiyun	b	exit_vmx_ops		/* tail call optimise */
641*4882a593Smuzhiyun#endif /* CONFIG_ALTIVEC */
642