/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif
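/*
 * Note on the macros above: memcmp() treats the buffers as arrays of
 * unsigned bytes, with the byte at the lowest address acting as the most
 * significant one for an unsigned doubleword compare. On big endian a
 * plain ldx already gives that ordering; on little endian the
 * byte-reversed ldbrx/lwbrx/lhbrx forms are used so that cmpld on the
 * loaded values still orders by memory position. As an illustrative
 * example, for the bytes 01 02 .. 08 stored at increasing addresses, LD
 * yields 0x0102030405060708 on either endianness, so cmpld matches what
 * a byte-by-byte compare would report. Similarly, LVS/VPERM are swapped
 * on little endian so that LD_VSR_CROSS16B below produces the bytes in
 * memory order regardless of endianness.
 */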

#define VMX_THRESH 4096
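/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS wrap the calls into enter_vmx_ops()/
 * exit_vmx_ops(). They save the argument registers r3/r4/r5 and the
 * link register, create a temporary stack frame for the call, and
 * restore everything afterwards. enter_vmx_ops() returns non-zero in
 * r3 when VMX may actually be used; that result is latched into cr1
 * so callers can fall back to the integer path (see the
 * "beq cr1,..." checks after ENTER_VMX_OPS below).
 */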
#define ENTER_VMX_OPS	\
	mflr    r0;	\
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      enter_vmx_ops; \
	cmpwi   cr1,r3,0; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

#define EXIT_VMX_OPS \
	mflr    r0; \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      exit_vmx_ops; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx     _v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named .Lsameoffset_xxxx.
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named .Ldiffoffset_xxxx.
 */
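/*
 * Roughly, the entry dispatch below corresponds to this C sketch
 * (illustrative only; the labels name the paths actually taken):
 *
 *	if (n == 0)
 *		return 0;			-> .Lzero
 *	if (n <= 7)
 *		byte-by-byte compare		-> .Lshort
 *	else if (((long)s1 ^ (long)s2) & 7)
 *		different-offset path		-> .Ldiffoffset_8bytes_make_align_start
 *	else
 *		same-offset path		-> .Lsameoffset_8bytes_make_align_start
 */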
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have
	 * the same offset relative to an 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than
	 * 8 bytes at aligned addresses.
	 */
	cmpdi   cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on 8-byte aligned addresses.
	 */
	andi.   r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left by the
	 * appropriate number of bits before the comparison.
	 */
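	/* Worked example (illustrative address): if r3 = 0x1003, then
	 * rlwinm r6,r3,3,26,28 gives r6 = (r3 & 7) << 3 = 24. The loads
	 * below fetch the doublewords at 0x1000 and (r4 & ~7UL), and
	 * shifting both left by 24 bits drops the 3 bytes that precede
	 * the buffers, so only bytes belonging to s1/s2 are compared.
	 * srwi r6,r6,3 then converts the bit count back to a byte count
	 * (3), and 8 - 3 = 5 bytes are consumed from the length.
	 */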
	rlwinm  r6,r3,3,26,28
	beq     .Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi   cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi   cr5,r5,7
	srdi    r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi  r5,r5,61
	mtctr   r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi   r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
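	/* For example (illustrative value): with r4 = 0x2ff9, the check
	 * below computes r6 = r4 & 0xfff = 0xff9 > 0xff8, i.e. a full
	 * 8-byte load at r4 would touch byte 0x3000 on the next 4K page,
	 * so we take the safe byte-by-byte .Lshort path instead.
	 */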
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic  r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the VMX loop if the length is 4K bytes or greater */
	cmpldi  cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
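	/* The feature section above is patched out (replaced with nops)
	 * at boot on CPUs that do not advertise CPU_FTR_ARCH_207S, so the
	 * VMX branch is only ever taken on ISA 2.07 (POWER8) and later.
	 */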

.Llong_novmx_cmp:
#endif
	/* At least the s1 address is aligned to 8 bytes */
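	/* The loop below compares 32 bytes per iteration. It is software
	 * pipelined: the eight LD results of one iteration are compared
	 * with cmpld into four condition fields (cr0, cr1, cr6, cr7)
	 * while the loads for the next iteration are already issued, and
	 * a mismatch in any field branches out to the matching .LcmpXY
	 * handler to compute the return value.
	 */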
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:   /* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addresses that have the same offset
	 * relative to an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following observation:
	 * memcmp() tends to fail early, within the first 32 bytes.
	 * Before using VMX instructions, which incur the 32 x 128-bit
	 * VMX register save/restore penalty, we compare the first
	 * 32 bytes so that we can catch the ~80% of cases that fail there.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq     cr1,.Llong_novmx_cmp

3:
	/* Need to check whether r4 has the same offset as r3 relative
	 * to a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is at least 4KB. Align further, to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
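	/* mfocrf/mtocrf with mask 128 (0x80) select CR field 0 only, so
	 * the cmpld result survives exit_vmx_ops(), which may clobber
	 * the condition registers, and .LcmpAB_lightweight can still
	 * read cr0 to pick the +1/-1 return value.
	 */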
	mfocrf  r5,128
	EXIT_VMX_OPS
	mtocrf  128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes in each loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi  r5,r5,59
	li	off16,16

.balign 16
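	/* VCMPEQUD_RC is the record (dot) form of vcmpequd: besides
	 * writing the per-element result to v0 it sets cr6, with the
	 * "all elements equal" status in the LT bit of cr6. "bnl cr6"
	 * (branch if that bit is clear) therefore branches out as soon
	 * as one of the 16-byte chunks differs.
	 */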
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* the difference is in the 16 bytes at r3/r4; locate it with
	 * two scalar compares
	 */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm  r6,r3,3,26,28
	beq     .Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do VMX ops when the size is 4K bytes or greater */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi   cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before enabling
	 * VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.   r6,r3,0xf
	li	off16,16
	beq     .Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx     v5,0,r3
	lvx     v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic  r6,r6,16
	subf    r5,r6,r5
	add     r3,r3,r6
	add     r4,r4,r6

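	/* Either r3 was already 16-byte aligned (the branch above), or the
	 * 16 bytes at the old r3/r4 have just been compared via the
	 * permuted loads and the pointers/length advanced by 16 - (r3 & 0xf)
	 * so that r3 is now 16-byte aligned. r4 may still be unaligned, so
	 * the loop below keeps assembling its stream with LD_VSR_CROSS16B.
	 */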
.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx     v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5  /* the loop handles 32 bytes per iteration */
	clrldi  r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
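	/* Each LD_VSR_CROSS16B consumes the previously loaded quadword in
	 * v6 and loads the next one into v8; "vor v6,v8,v8" then copies v8
	 * into v6 so that quadword is reused as the "first" half on the
	 * next step instead of being loaded from memory again.
	 */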
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* the difference lies within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)