/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */
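
/*
 * For reference, a minimal C-level sketch of a typical call (illustrative
 * only, not part of this file): the 32-bit partial sum returned here is
 * normally folded down to 16 bits with the generic csum_fold() helper.
 *
 *	__wsum partial = csum_partial(buf, len, 0);
 *	__sum16 csum   = csum_fold(partial);
 */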

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;
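
/*
 * Roughly equivalent C for the macro above (a hedged sketch; the helper
 * name ones_add is only illustrative):
 *
 *	static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *	{
 *		sum += val;
 *		if (sum < val)	// the 32-bit add wrapped around, so
 *			sum++;	// feed the carry back in (end-around carry)
 *		return sum;
 *	}
 */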

.text
ENTRY(csum_partial)

	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	abi_entry_default
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if 2-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	abi_ret_default

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a8  = temp
	a9  = temp
	a10 = temp

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */
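
/*
 * A hedged C-level sketch of typical use (illustrative only, not part of
 * this file): the arch wrappers in <asm/checksum.h> forward to this routine,
 * e.g. csum_partial_copy_nocheck() for kernel-to-kernel copies, while the
 * user-copy variant relies on the fixup handler at the end of this file to
 * report a fault by returning 0.
 *
 *	__wsum sum = csum_partial_copy_nocheck(src, dst, len);
 */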

ENTRY(csum_partial_copy_generic)

	abi_entry_default
	movi	a5, -1
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	l32i	a8, a2, 4
EX(10f)	s32i	a9, a3, 0
EX(10f)	s32i	a8, a3, 4
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 8
EX(10f)	l32i	a8, a2, 12
EX(10f)	s32i	a9, a3, 8
EX(10f)	s32i	a8, a3, 12
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 16
EX(10f)	l32i	a8, a2, 20
EX(10f)	s32i	a9, a3, 16
EX(10f)	s32i	a8, a3, 20
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 24
EX(10f)	l32i	a8, a2, 28
EX(10f)	s32i	a9, a3, 24
EX(10f)	s32i	a8, a3, 28
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	s32i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	Control comes to here in two cases: (1) It may fall through
	to here from the 4-byte alignment case to process, at most,
	one 2-byte chunk.  (2) It branches to here from above if
	either src or dst is 2-byte aligned, and we process all bytes
	here, except for perhaps a trailing odd byte.  It's
	inefficient, so align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f)	l16ui	a9, a2, 0
EX(10f)	s16i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
EX(10f)	l8ui	a9, a2, 0
EX(10f)	s8i	a9, a3, 0
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	abi_ret_default

5:
	/* Control branches to here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f)	l8ui	a9, a2, 0
EX(10f)	l8ui	a8, a2, 1
EX(10f)	s8i	a9, a3, 0
EX(10f)	s8i	a8, a3, 1
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
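# The EX(10f) annotations above come from the asmmacro headers included at
# the top of this file; each one records the access in the kernel exception
# table so that a fault during a user copy jumps to label 10 below, which
# signals the error by returning 0 instead of a checksum.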
.section .fixup, "ax"
10:
	movi	a2, 0
	abi_ret_default

.previous