xref: /OK3568_Linux_fs/kernel/arch/sparc/lib/csum_copy.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0 */
/* csum_copy.S: Checksum+copy code for sparc64
 *
 * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
 */

#include <asm/export.h>

/* Spare global register used by the routine below: %g7 when built with
 * __KERNEL__ defined, %g5 otherwise.
 */
#ifdef __KERNEL__
#define GLOBAL_SPARE	%g7
#else
#define GLOBAL_SPARE	%g5
#endif

/* EX_LD() and EX_ST() wrap every checked load and store issued by the
 * routine below.  The defaults expand to the wrapped instruction
 * unchanged.  The #ifndef guards let a definition made before this
 * point take precedence.
 * NOTE(review): presumably used by including files to attach fault
 * handling for user-space accesses -- confirm against those files.
 */
#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

/* Overridable like the two hooks above; not referenced anywhere else
 * in this file.
 */
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

/* LOAD(type, addr, dest) expands to "type [addr], dest" and
 * STORE(type, src, addr) to "type src, [addr]".  Both can be replaced
 * by defining them beforehand.
 */
#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/* Symbol name given to the routine unless it was already chosen. */
#ifndef FUNC_NAME
#define FUNC_NAME	csum_partial_copy_nocheck
#endif

	/* Declare %g2 and %g3 to the assembler as scratch registers;
	 * the routine below uses both as temporaries.
	 */
	.register	%g2, #scratch
	.register	%g3, #scratch

	.text

/*
 * FUNC_NAME: copy a buffer while computing a 16-bit ones'-complement
 * sum of the bytes copied.
 *
 * In:   %o0 = src, %o1 = dst, %o2 = len in bytes
 * Out:  %o0 = the seed 0xffffffff (loaded into %o3 on entry) plus the
 *	 folded 16-bit sum, added with end-around carry and truncated
 *	 to 32 bits.  With len == 0 the seed is returned unchanged.
 * Uses: %g1, %g2, %g3, GLOBAL_SPARE, %o3, %o4, %o5 and the integer
 *	 condition codes.  The routine returns with retl and issues no
 *	 save/restore, so it runs in the caller's register window.
 *
 * Checked loads and stores go through EX_LD()/EX_ST() and
 * LOAD()/STORE(); the prefetches use LOAD() only.
 *
 * Two code paths, chosen by (src ^ dst) & 3:
 *   == 0:  src and dst share the same alignment.  Label 90 (placed
 *	    before the entry point) copies a leading byte and/or
 *	    halfword until src is word aligned, label 80 copies 64-byte
 *	    blocks and then single words, label 60 copies the tail.
 *	    GLOBAL_SPARE holds src & 1, %o4 the running sum.
 *   != 0:  label 95.  Loads are as wide as src alignment allows, all
 *	    stores are single bytes.  GLOBAL_SPARE is the running sum
 *	    and %o5 holds src & 1.
 *
 * When src starts on an odd address the two bytes of the final 16-bit
 * sum are swapped before it is added to the seed.
 *
 * Instructions indented by one extra space sit in a branch delay slot.
 */
90:
	/* Entered from FUNC_NAME when src is not word aligned.  The
	 * condition codes still hold the result of
	 * "andcc %o0, 0x1, GLOBAL_SPARE" from the delay slot of the
	 * branch that brought us here.
	 *
	 * We checked for zero length already, so there must be
	 * at least one byte.
	 */
	be,pt		%icc, 1f
	 nop
	/* Odd src: copy one byte; it becomes the initial sum in %o4. */
	EX_LD(LOAD(ldub, %o0 + 0x00, %o4))
	add		%o0, 1, %o0
	sub		%o2, 1, %o2
	EX_ST(STORE(stb, %o4, %o1 + 0x00))
	add		%o1, 1, %o1
	/* If src is still only halfword aligned, copy one halfword,
	 * provided at least two bytes remain (otherwise go to the tail
	 * code at 60).
	 */
1:	andcc		%o0, 0x2, %g0
	be,pn		%icc, 80f
	 cmp		%o2, 2
	blu,pn		%icc, 60f
	 nop
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	add		%o0, 2, %o0
	sub		%o2, 2, %o2
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 2, %o1
	ba,pt		%xcc, 80f
	 add		%o5, %o4, %o4

	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
	EXPORT_SYMBOL(FUNC_NAME)
FUNC_NAME:		/* %o0=src, %o1=dst, %o2=len */
	LOAD(prefetch, %o0 + 0x000, #n_reads)
	xor		%o0, %o1, %g1
	mov		-1, %o3			/* seed added to the sum at the end */
	clr		%o4			/* running sum */
	andcc		%g1, 0x3, %g0
	bne,pn		%icc, 95f		/* src/dst alignment differs */
	 LOAD(prefetch, %o0 + 0x040, #n_reads)

	brz,pn		%o2, 70f		/* len == 0: return the seed */
	 andcc		%o0, 0x3, %g0

	/* We "remember" whether the lowest bit in the address
	 * was set in GLOBAL_SPARE.  Because if it is, we have to swap
	 * upper and lower 8 bit fields of the sum we calculate.
	 */
	bne,pn		%icc, 90b
	 andcc		%o0, 0x1, GLOBAL_SPARE

80:
	/* src is word aligned.  %g3 = byte count covered by whole
	 * 64-byte blocks, %o2 = bytes left over after them.
	 */
	LOAD(prefetch, %o0 + 0x080, #n_reads)
	andncc		%o2, 0x3f, %g3

	LOAD(prefetch, %o0 + 0x0c0, #n_reads)
	sub		%o2, %g3, %o2
	brz,pn		%g3, 2f
	 LOAD(prefetch, %o0 + 0x100, #n_reads)

	/* So that we don't need to use the non-pairing
	 * add-with-carry instructions we accumulate 32-bit
	 * values into a 64-bit register.  At the end of the
	 * loop we fold it down to 32-bits and so on.
	 */
	ba,pt		%xcc, 1f
	 LOAD(prefetch, %o0 + 0x140, #n_reads)

	/* One iteration copies and sums 16 words (64 bytes).  %o5, %g1
	 * and %g2 rotate as load targets so each value is stored and
	 * added a few instructions after it was loaded.
	 */
	.align		32
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	EX_LD(LOAD(lduw, %o0 + 0x04, %g1))
	EX_LD(LOAD(lduw, %o0 + 0x08, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	EX_LD(LOAD(lduw, %o0 + 0x0c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x04))
	EX_LD(LOAD(lduw, %o0 + 0x10, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x08))
	EX_LD(LOAD(lduw, %o0 + 0x14, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x0c))
	EX_LD(LOAD(lduw, %o0 + 0x18, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x10))
	EX_LD(LOAD(lduw, %o0 + 0x1c, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x14))
	EX_LD(LOAD(lduw, %o0 + 0x20, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x18))
	EX_LD(LOAD(lduw, %o0 + 0x24, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x1c))
	EX_LD(LOAD(lduw, %o0 + 0x28, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x20))
	EX_LD(LOAD(lduw, %o0 + 0x2c, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x24))
	EX_LD(LOAD(lduw, %o0 + 0x30, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x28))
	EX_LD(LOAD(lduw, %o0 + 0x34, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x2c))
	EX_LD(LOAD(lduw, %o0 + 0x38, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x30))
	EX_LD(LOAD(lduw, %o0 + 0x3c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x34))
	LOAD(prefetch, %o0 + 0x180, #n_reads)
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x38))
	subcc		%g3, 0x40, %g3		/* sets the loop condition */
	add		%o0, 0x40, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x3c))
	bne,pt		%icc, 1b
	 add		%o1, 0x40, %o1

	/* Remaining whole words: %g3 = len & 0x3c, one word per pass. */
2:	and		%o2, 0x3c, %g3
	brz,pn		%g3, 2f
	 sub		%o2, %g3, %o2
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	subcc		%g3, 0x4, %g3
	add		%o0, 0x4, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	bne,pt		%icc, 1b
	 add		%o1, 0x4, %o1

2:
	/* fold 64-->32 */
	/* Done twice so a carry out of the first fold is added in too. */
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4

	/* fold 32-->16 */
	/* sethi leaves 0xffff0000 in %g1, so andn keeps the low 16 bits. */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

60:
	/* %o4 has the 16-bit sum we have calculated so-far.  */
	/* Tail: at most one halfword followed by at most one byte. */
	cmp		%o2, 2
	blu,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	sub		%o2, 2, %o2
	add		%o0, 2, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 0x2, %o1
1:	brz,pt		%o2, 1f
	 nop
	/* The last byte is added as the upper byte of a 16-bit word,
	 * hence the shift left by 8.
	 */
	EX_LD(LOAD(ldub, %o0 + 0x00, %o5))
	sub		%o2, 1, %o2
	add		%o0, 1, %o0
	EX_ST(STORE(stb, %o5, %o1 + 0x00))
	sllx		%o5, 8, %o5
	add		%o1, 1, %o1
	add		%o4, %o5, %o4
1:
	/* fold 32-->16 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

1:	brz,pt		GLOBAL_SPARE, 1f
	 nop

	/* We started with an odd byte, byte-swap the result.  */
	srl		%o4, 8, %o5
	and		%o4, 0xff, %g1
	sll		%g1, 8, %g1
	or		%o5, %g1, %o4

	/* Add the sum to the seed; addc adds the carry back in. */
1:	addcc		%o3, %o4, %o3
	addc		%g0, %o3, %o3

70:
	retl
	 srl		%o3, 0, %o0		/* clear the upper 32 bits */

	/* src and dst are aligned differently.  GLOBAL_SPARE is the
	 * running sum and %o5 = src & 1.  Branches carrying ",a" run
	 * their delay slot only when taken.
	 */
95:	mov		0, GLOBAL_SPARE
	brlez,pn	%o2, 4f			/* len <= 0: nothing to copy */
	 andcc		%o0, 1, %o5
	be,a,pt		%icc, 1f
	 srl		%o2, 1, %g1		/* %g1 = halfwords left */
	/* Odd src: copy one byte; it becomes the initial sum. */
	sub		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE))
	add		%o0, 1, %o0
	EX_ST(STORE(stb, GLOBAL_SPARE, %o1))
	srl		%o2, 1, %g1
	add		%o1, 1, %o1
1:	brz,a,pn	%g1, 3f			/* under two bytes left */
	 andcc		%o2, 1, %g0
	andcc		%o0, 2, %g0
	be,a,pt		%icc, 1f
	 srl		%g1, 1, %g1		/* %g1 = words left */
	/* src is only halfword aligned: load a halfword and store it
	 * as two bytes, upper byte first.
	 */
	EX_LD(LOAD(lduh, %o0, %o4))
	sub		%o2, 2, %o2
	srl		%o4, 8, %g2
	sub		%g1, 1, %g1
	EX_ST(STORE(stb, %g2, %o1))
	add		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o0, 2, %o0
	srl		%g1, 1, %g1
	add		%o1, 2, %o1
1:	brz,a,pn	%g1, 2f			/* no whole word left */
	 andcc		%o2, 2, %g0
	EX_LD(LOAD(lduw, %o0, %o4))
	/* Word loop: load a word, store it as four bytes from the most
	 * significant down, and add it to the sum with end-around
	 * carry.  The next word is loaded in the annulled delay slot,
	 * so only when the loop repeats.
	 */
5:	srl		%o4, 24, %g2
	srl		%o4, 16, %g3
	EX_ST(STORE(stb, %g2, %o1))
	srl		%o4, 8, %g2
	EX_ST(STORE(stb, %g3, %o1 + 1))
	add		%o0, 4, %o0
	EX_ST(STORE(stb, %g2, %o1 + 2))
	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 3))
	addc		GLOBAL_SPARE, %g0, GLOBAL_SPARE
	add		%o1, 4, %o1
	subcc		%g1, 1, %g1
	bne,a,pt	%icc, 5b
	 EX_LD(LOAD(lduw, %o0, %o4))
	/* Add the upper and lower 16-bit halves of the sum and test
	 * for a trailing halfword.
	 */
	sll		GLOBAL_SPARE, 16, %g2
	srl		GLOBAL_SPARE, 16, GLOBAL_SPARE
	srl		%g2, 16, %g2
	andcc		%o2, 2, %g0
	add		%g2, GLOBAL_SPARE, GLOBAL_SPARE
2:	be,a,pt		%icc, 3f		/* no trailing halfword */
	 andcc		%o2, 1, %g0
	EX_LD(LOAD(lduh, %o0, %o4))
	andcc		%o2, 1, %g0		/* tested by the branch at 3 */
	srl		%o4, 8, %g2
	add		%o0, 2, %o0
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o1, 2, %o1
3:	be,a,pt		%icc, 1f		/* no trailing byte */
	 sll		GLOBAL_SPARE, 16, %o4
	/* The trailing byte is added as the upper byte of a 16-bit word. */
	EX_LD(LOAD(ldub, %o0, %g2))
	sll		%g2, 8, %o4
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 16, %o4
	/* Fold to 16 bits: adding (sum << 16) to sum leaves upper half +
	 * lower half in the upper 16 bits; addc adds the carry back in.
	 */
1:	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	srl		GLOBAL_SPARE, 16, %o4
	addc		%g0, %o4, GLOBAL_SPARE
	brz,pt		%o5, 4f			/* src was even: no swap */
	 srl		GLOBAL_SPARE, 8, %o4
	/* src started odd: swap the two bytes of the 16-bit sum. */
	and		GLOBAL_SPARE, 0xff, %g2
	and		%o4, 0xff, %o4
	sll		%g2, 8, %g2
	or		%g2, %o4, GLOBAL_SPARE
	/* Add the sum to the seed with end-around carry and return. */
4:	addcc		%o3, GLOBAL_SPARE, %o3
	addc		%g0, %o3, %o0
	retl
	 srl		%o0, 0, %o0
	.size		FUNC_NAME, .-FUNC_NAME