xref: /OK3568_Linux_fs/kernel/arch/powerpc/lib/checksum_32.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * This file contains assembly-language implementations
4*4882a593Smuzhiyun * of IP-style 1's complement checksum routines.
5*4882a593Smuzhiyun *
6*4882a593Smuzhiyun *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
7*4882a593Smuzhiyun *
8*4882a593Smuzhiyun * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
9*4882a593Smuzhiyun */
10*4882a593Smuzhiyun
11*4882a593Smuzhiyun#include <linux/sys.h>
12*4882a593Smuzhiyun#include <asm/processor.h>
13*4882a593Smuzhiyun#include <asm/cache.h>
14*4882a593Smuzhiyun#include <asm/errno.h>
15*4882a593Smuzhiyun#include <asm/ppc_asm.h>
16*4882a593Smuzhiyun#include <asm/export.h>
17*4882a593Smuzhiyun
18*4882a593Smuzhiyun	.text
19*4882a593Smuzhiyun
20*4882a593Smuzhiyun/*
21*4882a593Smuzhiyun * computes the checksum of a memory block at buff, length len,
22*4882a593Smuzhiyun * and adds in "sum" (32-bit)
23*4882a593Smuzhiyun *
24*4882a593Smuzhiyun * __csum_partial(buff, len, sum)
 *
 * In:   r3 = buff, r4 = len (bytes), r5 = incoming 32-bit sum
 * Out:  r3 = updated 1's complement partial sum (carry folded in)
 * Uses: r0, r6, r7, r8, ctr, CA
 *
 * The CA (carry) bit accumulates the 1's complement wrap-around
 * between successive "adde" instructions; the record-form shift /
 * logical instructions below (srwi., andi., ...) update CR0 but do
 * not disturb CA, so the carry chain survives the loop control.
25*4882a593Smuzhiyun */
26*4882a593Smuzhiyun_GLOBAL(__csum_partial)
27*4882a593Smuzhiyun	subi	r3,r3,4		/* bias pointer: data is always read at 4(r3)/lwzu */
28*4882a593Smuzhiyun	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
29*4882a593Smuzhiyun	beq	3f		/* if we're doing < 4 bytes */
30*4882a593Smuzhiyun	andi.	r0,r3,2		/* Align buffer to longword boundary */
31*4882a593Smuzhiyun	beq+	1f
32*4882a593Smuzhiyun	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
33*4882a593Smuzhiyun	subi	r4,r4,2
34*4882a593Smuzhiyun	addi	r3,r3,2
35*4882a593Smuzhiyun	srwi.	r6,r4,2		/* # words to do */
36*4882a593Smuzhiyun	adde	r5,r5,r0
37*4882a593Smuzhiyun	beq	3f
38*4882a593Smuzhiyun1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
39*4882a593Smuzhiyun	beq	21f
40*4882a593Smuzhiyun	mtctr	r6
2:	lwzu	r0,4(r3)	/* one word at a time until a multiple of 4 words is left */
42*4882a593Smuzhiyun	adde	r5,r5,r0
43*4882a593Smuzhiyun	bdnz	2b
44*4882a593Smuzhiyun21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
45*4882a593Smuzhiyun	beq	3f
	/* software-pipelined 4-words-per-iteration loop: each load runs
	   one slot ahead of the adde that consumes its value */
46*4882a593Smuzhiyun	lwz	r0,4(r3)
47*4882a593Smuzhiyun	mtctr	r6
48*4882a593Smuzhiyun	lwz	r6,8(r3)
49*4882a593Smuzhiyun	adde	r5,r5,r0
50*4882a593Smuzhiyun	lwz	r7,12(r3)
51*4882a593Smuzhiyun	adde	r5,r5,r6
52*4882a593Smuzhiyun	lwzu	r8,16(r3)
53*4882a593Smuzhiyun	adde	r5,r5,r7
54*4882a593Smuzhiyun	bdz	23f		/* exactly one block: only r8 still pending */
55*4882a593Smuzhiyun22:	lwz	r0,4(r3)
56*4882a593Smuzhiyun	adde	r5,r5,r8
57*4882a593Smuzhiyun	lwz	r6,8(r3)
58*4882a593Smuzhiyun	adde	r5,r5,r0
59*4882a593Smuzhiyun	lwz	r7,12(r3)
60*4882a593Smuzhiyun	adde	r5,r5,r6
61*4882a593Smuzhiyun	lwzu	r8,16(r3)
62*4882a593Smuzhiyun	adde	r5,r5,r7
63*4882a593Smuzhiyun	bdnz	22b
64*4882a593Smuzhiyun23:	adde	r5,r5,r8	/* drain the last pipelined word */
65*4882a593Smuzhiyun3:	andi.	r0,r4,2		/* trailing halfword? */
66*4882a593Smuzhiyun	beq+	4f
67*4882a593Smuzhiyun	lhz	r0,4(r3)
68*4882a593Smuzhiyun	addi	r3,r3,2
69*4882a593Smuzhiyun	adde	r5,r5,r0
70*4882a593Smuzhiyun4:	andi.	r0,r4,1		/* trailing byte? */
71*4882a593Smuzhiyun	beq+	5f
72*4882a593Smuzhiyun	lbz	r0,4(r3)
73*4882a593Smuzhiyun	slwi	r0,r0,8		/* Upper byte of word */
74*4882a593Smuzhiyun	adde	r5,r5,r0
75*4882a593Smuzhiyun5:	addze	r3,r5		/* add in final carry */
76*4882a593Smuzhiyun	blr
77*4882a593SmuzhiyunEXPORT_SYMBOL(__csum_partial)
78*4882a593Smuzhiyun
79*4882a593Smuzhiyun/*
80*4882a593Smuzhiyun * Computes the checksum of a memory block at src, length len,
81*4882a593Smuzhiyun * and adds in 0xffffffff, while copying the block to dst.
82*4882a593Smuzhiyun * If an access exception occurs it returns zero.
83*4882a593Smuzhiyun *
84*4882a593Smuzhiyun * csum_partial_copy_generic(src, dst, len)
85*4882a593Smuzhiyun */
/*
 * Copy 16 bytes from 4(r4)..16(r4) to 4(r6)..16(r6) (both pointers
 * pre-biased by -4; the lwzu/stwu advance them), accumulating the four
 * words into the running sum in r12 via adde.  Every load and store is
 * tagged with a numeric label 8<n>0 .. 8<n>7 so that the matching
 * CSUM_COPY_16_BYTES_EXCODE(n) below can register an exception-table
 * fixup for each one.
 */
86*4882a593Smuzhiyun#define CSUM_COPY_16_BYTES_WITHEX(n)	\
87*4882a593Smuzhiyun8 ## n ## 0:			\
88*4882a593Smuzhiyun	lwz	r7,4(r4);	\
89*4882a593Smuzhiyun8 ## n ## 1:			\
90*4882a593Smuzhiyun	lwz	r8,8(r4);	\
91*4882a593Smuzhiyun8 ## n ## 2:			\
92*4882a593Smuzhiyun	lwz	r9,12(r4);	\
93*4882a593Smuzhiyun8 ## n ## 3:			\
94*4882a593Smuzhiyun	lwzu	r10,16(r4);	\
95*4882a593Smuzhiyun8 ## n ## 4:			\
96*4882a593Smuzhiyun	stw	r7,4(r6);	\
97*4882a593Smuzhiyun	adde	r12,r12,r7;	\
98*4882a593Smuzhiyun8 ## n ## 5:			\
99*4882a593Smuzhiyun	stw	r8,8(r6);	\
100*4882a593Smuzhiyun	adde	r12,r12,r8;	\
101*4882a593Smuzhiyun8 ## n ## 6:			\
102*4882a593Smuzhiyun	stw	r9,12(r6);	\
103*4882a593Smuzhiyun	adde	r12,r12,r9;	\
104*4882a593Smuzhiyun8 ## n ## 7:			\
105*4882a593Smuzhiyun	stwu	r10,16(r6);	\
106*4882a593Smuzhiyun	adde	r12,r12,r10
107*4882a593Smuzhiyun
/*
 * Emit the exception-table entries for the eight labelled instructions
 * of CSUM_COPY_16_BYTES_WITHEX(n); any fault there diverts to "fault".
 */
108*4882a593Smuzhiyun#define CSUM_COPY_16_BYTES_EXCODE(n)		\
109*4882a593Smuzhiyun	EX_TABLE(8 ## n ## 0b, fault);	\
110*4882a593Smuzhiyun	EX_TABLE(8 ## n ## 1b, fault);	\
111*4882a593Smuzhiyun	EX_TABLE(8 ## n ## 2b, fault);	\
112*4882a593Smuzhiyun	EX_TABLE(8 ## n ## 3b, fault);	\
113*4882a593Smuzhiyun	EX_TABLE(8 ## n ## 4b, fault);	\
114*4882a593Smuzhiyun	EX_TABLE(8 ## n ## 5b, fault);	\
115*4882a593Smuzhiyun	EX_TABLE(8 ## n ## 6b, fault);	\
116*4882a593Smuzhiyun	EX_TABLE(8 ## n ## 7b, fault);
117*4882a593Smuzhiyun
118*4882a593Smuzhiyun	.text
	/* stabs records naming the source file for debuggers */
119*4882a593Smuzhiyun	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
120*4882a593Smuzhiyun	.stabs	"checksum_32.S",N_SO,0,0,0f
121*4882a593Smuzhiyun0:
122*4882a593Smuzhiyun
	/* Cacheline geometry, taken from <asm/cache.h> for this build */
123*4882a593SmuzhiyunCACHELINE_BYTES = L1_CACHE_BYTES
124*4882a593SmuzhiyunLG_CACHELINE_BYTES = L1_CACHE_SHIFT
125*4882a593SmuzhiyunCACHELINE_MASK = (L1_CACHE_BYTES-1)
126*4882a593Smuzhiyun
127*4882a593Smuzhiyun_GLOBAL(csum_partial_copy_generic)
	/*
	 * Register map:
	 *   r3 = src (arg), r4 = dst (arg), r5 = len (arg)
	 *   after setup: r4 = src - 4, r6 = dst - 4 (biased for lwzu/stwu)
	 *   r12 = running checksum, started at 0xffffffff ("adds in
	 *         0xffffffff" per the header comment)
	 *   cr7.eq = destination word-parity flag: cleared only when dst
	 *         is odd, in which case r12 is pre-rotated one byte and
	 *         the final result rotated back before return
	 *   returns r3 = checksum, or 0 if a fault occurred (see "fault")
	 */
128*4882a593Smuzhiyun	li	r12,-1
129*4882a593Smuzhiyun	addic	r0,r0,0			/* clear carry */
130*4882a593Smuzhiyun	addi	r6,r4,-4
131*4882a593Smuzhiyun	neg	r0,r4
132*4882a593Smuzhiyun	addi	r4,r3,-4
133*4882a593Smuzhiyun	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
134*4882a593Smuzhiyun	crset	4*cr7+eq		/* assume even dst unless shown otherwise */
135*4882a593Smuzhiyun	beq	58f			/* dst already cacheline-aligned */
136*4882a593Smuzhiyun
137*4882a593Smuzhiyun	cmplw	0,r5,r0			/* is this more than total to do? */
138*4882a593Smuzhiyun	blt	63f			/* if not much to do */
139*4882a593Smuzhiyun	rlwinm	r7,r6,3,0x8		/* r7 = 8 if dst is odd, else 0 */
140*4882a593Smuzhiyun	rlwnm	r12,r12,r7,0,31	/* odd destination address: rotate one byte */
141*4882a593Smuzhiyun	cmplwi	cr7,r7,0	/* is destination address even ? */
142*4882a593Smuzhiyun	andi.	r8,r0,3			/* get it word-aligned first */
143*4882a593Smuzhiyun	mtctr	r8
144*4882a593Smuzhiyun	beq+	61f
145*4882a593Smuzhiyun	li	r3,0			/* r3 gathers the leading bytes into a word */
146*4882a593Smuzhiyun70:	lbz	r9,4(r4)		/* do some bytes */
147*4882a593Smuzhiyun	addi	r4,r4,1
148*4882a593Smuzhiyun	slwi	r3,r3,8
149*4882a593Smuzhiyun	rlwimi	r3,r9,0,24,31		/* insert byte into low 8 bits of r3 */
150*4882a593Smuzhiyun71:	stb	r9,4(r6)
151*4882a593Smuzhiyun	addi	r6,r6,1
152*4882a593Smuzhiyun	bdnz	70b
153*4882a593Smuzhiyun	adde	r12,r12,r3		/* fold the gathered bytes into the sum */
154*4882a593Smuzhiyun61:	subf	r5,r0,r5		/* len -= alignment bytes */
155*4882a593Smuzhiyun	srwi.	r0,r0,2			/* # words to reach the cacheline boundary */
156*4882a593Smuzhiyun	mtctr	r0
157*4882a593Smuzhiyun	beq	58f
158*4882a593Smuzhiyun72:	lwzu	r9,4(r4)		/* do some words */
159*4882a593Smuzhiyun	adde	r12,r12,r9
160*4882a593Smuzhiyun73:	stwu	r9,4(r6)
161*4882a593Smuzhiyun	bdnz	72b
162*4882a593Smuzhiyun
163*4882a593Smuzhiyun58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
164*4882a593Smuzhiyun	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = leftover bytes */
165*4882a593Smuzhiyun	li	r11,4			/* dcbz offset (dst is biased by -4) */
166*4882a593Smuzhiyun	beq	63f
167*4882a593Smuzhiyun
168*4882a593Smuzhiyun	/* Here we decide how far ahead to prefetch the source */
169*4882a593Smuzhiyun	li	r3,4
170*4882a593Smuzhiyun	cmpwi	r0,1
171*4882a593Smuzhiyun	li	r7,0			/* r7 = # cachelines prefetched ahead */
172*4882a593Smuzhiyun	ble	114f
173*4882a593Smuzhiyun	li	r7,1
174*4882a593Smuzhiyun#if MAX_COPY_PREFETCH > 1
175*4882a593Smuzhiyun	/* Heuristically, for large transfers we prefetch
176*4882a593Smuzhiyun	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
177*4882a593Smuzhiyun	   we prefetch 1 cacheline ahead. */
178*4882a593Smuzhiyun	cmpwi	r0,MAX_COPY_PREFETCH
179*4882a593Smuzhiyun	ble	112f
180*4882a593Smuzhiyun	li	r7,MAX_COPY_PREFETCH
181*4882a593Smuzhiyun112:	mtctr	r7
182*4882a593Smuzhiyun111:	dcbt	r3,r4			/* warm the source prefetch window */
183*4882a593Smuzhiyun	addi	r3,r3,CACHELINE_BYTES
184*4882a593Smuzhiyun	bdnz	111b
185*4882a593Smuzhiyun#else
186*4882a593Smuzhiyun	dcbt	r3,r4
187*4882a593Smuzhiyun	addi	r3,r3,CACHELINE_BYTES
188*4882a593Smuzhiyun#endif /* MAX_COPY_PREFETCH > 1 */
189*4882a593Smuzhiyun
190*4882a593Smuzhiyun114:	subf	r8,r7,r0		/* ctr = lines minus those prefetched ahead */
191*4882a593Smuzhiyun	mr	r0,r7			/* r0 = prefetched lines still to copy */
192*4882a593Smuzhiyun	mtctr	r8
193*4882a593Smuzhiyun
194*4882a593Smuzhiyun53:	dcbt	r3,r4			/* prefetch next source line */
195*4882a593Smuzhiyun54:	dcbz	r11,r6			/* zero dst line to avoid the read-for-ownership */
196*4882a593Smuzhiyun/* the main body of the cacheline loop */
197*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_WITHEX(0)
198*4882a593Smuzhiyun#if L1_CACHE_BYTES >= 32
199*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_WITHEX(1)
200*4882a593Smuzhiyun#if L1_CACHE_BYTES >= 64
201*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_WITHEX(2)
202*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_WITHEX(3)
203*4882a593Smuzhiyun#if L1_CACHE_BYTES >= 128
204*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_WITHEX(4)
205*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_WITHEX(5)
206*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_WITHEX(6)
207*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_WITHEX(7)
208*4882a593Smuzhiyun#endif
209*4882a593Smuzhiyun#endif
210*4882a593Smuzhiyun#endif
211*4882a593Smuzhiyun	bdnz	53b
212*4882a593Smuzhiyun	cmpwi	r0,0			/* copy the already-prefetched lines */
213*4882a593Smuzhiyun	li	r3,4			/* without issuing further prefetches */
214*4882a593Smuzhiyun	li	r7,0
215*4882a593Smuzhiyun	bne	114b
216*4882a593Smuzhiyun
217*4882a593Smuzhiyun63:	srwi.	r0,r5,2			/* remaining whole words */
218*4882a593Smuzhiyun	mtctr	r0
219*4882a593Smuzhiyun	beq	64f
220*4882a593Smuzhiyun30:	lwzu	r0,4(r4)
221*4882a593Smuzhiyun	adde	r12,r12,r0
222*4882a593Smuzhiyun31:	stwu	r0,4(r6)
223*4882a593Smuzhiyun	bdnz	30b
224*4882a593Smuzhiyun
225*4882a593Smuzhiyun64:	andi.	r0,r5,2			/* trailing halfword? */
226*4882a593Smuzhiyun	beq+	65f
227*4882a593Smuzhiyun40:	lhz	r0,4(r4)
228*4882a593Smuzhiyun	addi	r4,r4,2
229*4882a593Smuzhiyun41:	sth	r0,4(r6)
230*4882a593Smuzhiyun	adde	r12,r12,r0
231*4882a593Smuzhiyun	addi	r6,r6,2
232*4882a593Smuzhiyun65:	andi.	r0,r5,1			/* trailing byte? */
233*4882a593Smuzhiyun	beq+	66f
234*4882a593Smuzhiyun50:	lbz	r0,4(r4)
235*4882a593Smuzhiyun51:	stb	r0,4(r6)
236*4882a593Smuzhiyun	slwi	r0,r0,8			/* byte goes in the upper half of the halfword */
237*4882a593Smuzhiyun	adde	r12,r12,r0
238*4882a593Smuzhiyun66:	addze	r3,r12			/* fold final carry into the result */
239*4882a593Smuzhiyun	beqlr+	cr7			/* even dst: done */
240*4882a593Smuzhiyun	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
241*4882a593Smuzhiyun	blr
242*4882a593Smuzhiyun
	/* Any faulting load or store above lands here via the exception
	   table; per the header comment, return 0 on an access fault. */
243*4882a593Smuzhiyunfault:
244*4882a593Smuzhiyun	li	r3,0
245*4882a593Smuzhiyun	blr
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun	EX_TABLE(70b, fault);
248*4882a593Smuzhiyun	EX_TABLE(71b, fault);
249*4882a593Smuzhiyun	EX_TABLE(72b, fault);
250*4882a593Smuzhiyun	EX_TABLE(73b, fault);
251*4882a593Smuzhiyun	EX_TABLE(54b, fault);
252*4882a593Smuzhiyun
253*4882a593Smuzhiyun/*
254*4882a593Smuzhiyun * Exception-table entries for the cacheline copy loop above: a fault
255*4882a593Smuzhiyun * in either the read or the write part branches to "fault", which
256*4882a593Smuzhiyun * returns zero.
257*4882a593Smuzhiyun */
257*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_EXCODE(0)
258*4882a593Smuzhiyun#if L1_CACHE_BYTES >= 32
259*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_EXCODE(1)
260*4882a593Smuzhiyun#if L1_CACHE_BYTES >= 64
261*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_EXCODE(2)
262*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_EXCODE(3)
263*4882a593Smuzhiyun#if L1_CACHE_BYTES >= 128
264*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_EXCODE(4)
265*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_EXCODE(5)
266*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_EXCODE(6)
267*4882a593Smuzhiyun	CSUM_COPY_16_BYTES_EXCODE(7)
268*4882a593Smuzhiyun#endif
269*4882a593Smuzhiyun#endif
270*4882a593Smuzhiyun#endif
271*4882a593Smuzhiyun
272*4882a593Smuzhiyun	EX_TABLE(30b, fault);
273*4882a593Smuzhiyun	EX_TABLE(31b, fault);
274*4882a593Smuzhiyun	EX_TABLE(40b, fault);
275*4882a593Smuzhiyun	EX_TABLE(41b, fault);
276*4882a593Smuzhiyun	EX_TABLE(50b, fault);
277*4882a593Smuzhiyun	EX_TABLE(51b, fault);
278*4882a593Smuzhiyun
279*4882a593SmuzhiyunEXPORT_SYMBOL(csum_partial_copy_generic)
280*4882a593Smuzhiyun
281*4882a593Smuzhiyun/*
282*4882a593Smuzhiyun * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
283*4882a593Smuzhiyun *			   const struct in6_addr *daddr,
284*4882a593Smuzhiyun *			   __u32 len, __u8 proto, __wsum sum)
 *
 * In:   r3 = saddr, r4 = daddr, r5 = len, r6 = proto, r7 = sum
 * Out:  r3 = 16-bit folded, complemented checksum of the IPv6
 *       pseudo-header (4 words of each address + len + proto + sum)
 *
 * Loads are interleaved with the adde chain to hide load latency;
 * the CA bit threads the 1's complement carries between additions.
285*4882a593Smuzhiyun */
286*4882a593Smuzhiyun
287*4882a593Smuzhiyun_GLOBAL(csum_ipv6_magic)
288*4882a593Smuzhiyun	lwz	r8, 0(r3)
289*4882a593Smuzhiyun	lwz	r9, 4(r3)
290*4882a593Smuzhiyun	addc	r0, r7, r8	/* start the carry chain with sum + saddr[0] */
291*4882a593Smuzhiyun	lwz	r10, 8(r3)
292*4882a593Smuzhiyun	adde	r0, r0, r9
293*4882a593Smuzhiyun	lwz	r11, 12(r3)
294*4882a593Smuzhiyun	adde	r0, r0, r10
295*4882a593Smuzhiyun	lwz	r8, 0(r4)	/* now the four words of daddr */
296*4882a593Smuzhiyun	adde	r0, r0, r11
297*4882a593Smuzhiyun	lwz	r9, 4(r4)
298*4882a593Smuzhiyun	adde	r0, r0, r8
299*4882a593Smuzhiyun	lwz	r10, 8(r4)
300*4882a593Smuzhiyun	adde	r0, r0, r9
301*4882a593Smuzhiyun	lwz	r11, 12(r4)
302*4882a593Smuzhiyun	adde	r0, r0, r10
303*4882a593Smuzhiyun	add	r5, r5, r6	/* assumption: len + proto doesn't carry */
304*4882a593Smuzhiyun	adde	r0, r0, r11
305*4882a593Smuzhiyun	adde	r0, r0, r5
306*4882a593Smuzhiyun	addze	r0, r0		/* fold in the last carry */
307*4882a593Smuzhiyun	rotlwi	r3, r0, 16	/* fold 32 bits to 16: add the two halves; */
308*4882a593Smuzhiyun	add	r3, r0, r3	/* the folded sum ends up in the high half */
309*4882a593Smuzhiyun	not	r3, r3		/* 1's complement */
310*4882a593Smuzhiyun	rlwinm	r3, r3, 16, 16, 31	/* extract the high half as the result */
311*4882a593Smuzhiyun	blr
312*4882a593SmuzhiyunEXPORT_SYMBOL(csum_ipv6_magic)
313