/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *		Pentium Pro/II routines:
 *		Alexander Kjeldaas <astor@guardian.no>
 *		Finn Arne Gangstad <finnag@guardian.no>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * Changes:	Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *			     handling.
 *		Andi Kleen,  add zeroing on error
 *			     converted to pure assembler
 */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/asm.h>
#include <asm/export.h>
#include <asm/nospec-branch.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char *buff, int len, unsigned int sum)
 */
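
/*
 * A plain C model of what this routine computes (an illustrative
 * sketch, not the kernel's generic lib/checksum.c): accumulate the
 * buffer into a 32-bit one's-complement sum; callers fold it down to
 * 16 bits.  The odd-address byte rotation done by the asm below is
 * omitted here.
 *
 *	unsigned int csum_partial_model(const unsigned char *buff,
 *					int len, unsigned int sum)
 *	{
 *		unsigned long long acc = sum;
 *
 *		while (len >= 4) {	// a dword at a time
 *			acc += *(const unsigned int *)buff;
 *			buff += 4;
 *			len -= 4;
 *		}
 *		if (len >= 2) {		// trailing halfword
 *			acc += *(const unsigned short *)buff;
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)		// trailing byte (little-endian)
 *			acc += *buff;
 *		while (acc >> 32)	// fold the end-around carries
 *			acc = (acc & 0xffffffffull) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */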

.text

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

	  /*
	   * Experiments with Ethernet and SLIP connections show that buff
	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
	   * alignment for the unrolled loop.
	   */
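	  /*
	   * A rough C sketch of that conversion (length checks omitted;
	   * rol32() as in <linux/bitops.h>):
	   *
	   *	if ((long)buff & 1) {	// odd start: sum byte-rotated,
	   *		sum = rol32(sum + *buff++, 8);	// undone at label 7
	   *		len--;
	   *	}
	   *	if ((long)buff & 2) {	// now reach 4-byte alignment
	   *		sum += *(const u16 *)buff;
	   *		buff += 2;
	   *		len -= 2;
	   *	}
	   */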
SYM_FUNC_START(csum_partial)
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: unsigned char *buff
	testl $3, %esi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	testl $1, %esi		# Check alignment.
	jz 10f			# Jump if 2-byte aligned.

	# buf is odd
	dec %ecx
	jl 8f
	movzbl (%esi), %ebx
	adcl %ebx, %eax
	roll $8, %eax
	inc %esi
	testl $2, %esi
	jz 2f
10:
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2.  Deal with it.
	jmp 4f
1:	movw (%esi), %bx
	addl $2, %esi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, %edx
	shrl $5, %ecx
	jz 2f
	testl %esi, %esi	# clear CF (testl never sets carry)
1:	movl (%esi), %ebx
	adcl %ebx, %eax
	movl 4(%esi), %ebx
	adcl %ebx, %eax
	movl 8(%esi), %ebx
	adcl %ebx, %eax
	movl 12(%esi), %ebx
	adcl %ebx, %eax
	movl 16(%esi), %ebx
	adcl %ebx, %eax
	movl 20(%esi), %ebx
	adcl %ebx, %eax
	movl 24(%esi), %ebx
	adcl %ebx, %eax
	movl 28(%esi), %ebx
	adcl %ebx, %eax
	lea 32(%esi), %esi
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl %edx, %ecx
	andl $0x1c, %edx
	je 4f
	shrl $2, %edx		# This clears CF
3:	adcl (%esi), %eax
	lea 4(%esi), %esi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx
	jz 7f
	cmpl $2, %ecx
	jb 5f
	movw (%esi),%cx
	leal 2(%esi),%esi
	je 6f
	shll $16,%ecx
5:	movb (%esi),%cl
6:	addl %ecx,%eax
	adcl $0, %eax
7:
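	# If buff was at an odd address, the sum was accumulated
	# byte-rotated (see the "buf is odd" prologue above); test bit 0
	# of the buff argument and rotate the result back into place.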
	testb $1, 12(%esp)
	jz 8f
	roll $8, %eax
8:
	popl %ebx
	popl %esi
	RET
SYM_FUNC_END(csum_partial)

#else

/* Version for PentiumII/PPro */

SYM_FUNC_START(csum_partial)
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf

	testl $3, %esi
	jnz 25f
10:
	movl %ecx, %edx
	movl %ecx, %ebx
	andl $0x7c, %ebx
	shrl $7, %ecx
	addl %ebx,%esi
	shrl $2, %ebx
	negl %ebx
	lea 45f(%ebx,%ebx,2), %ebx
	testl %esi, %esi
	JMP_NOSPEC ebx
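	# Computed entry into the unrolled block below: each
	# "adcl off(%esi),%eax" assembles to 3 bytes, so 45f - 3*n
	# (n = dwords in the partial block, 0..31) lands on the adcl
	# that sums exactly those dwords.  The testl above just clears
	# CF for the first adcl.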

	# Handle 2-byte-aligned regions
20:	addw (%esi), %ax
	lea 2(%esi), %esi
	adcl $0, %eax
	jmp 10b
25:
	testl $1, %esi
	jz 30f
	# buf is odd
	dec %ecx
	jl 90f
	movzbl (%esi), %ebx
	addl %ebx, %eax
	adcl $0, %eax
	roll $8, %eax
	inc %esi
	testl $2, %esi
	jz 10b

30:	subl $2, %ecx
	ja 20b
	je 32f
	addl $2, %ecx
	jz 80f
	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
	addl %ebx, %eax
	adcl $0, %eax
	jmp 80f
32:
	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
	adcl $0, %eax
	jmp 80f

40:
	addl -128(%esi), %eax
	adcl -124(%esi), %eax
	adcl -120(%esi), %eax
	adcl -116(%esi), %eax
	adcl -112(%esi), %eax
	adcl -108(%esi), %eax
	adcl -104(%esi), %eax
	adcl -100(%esi), %eax
	adcl -96(%esi), %eax
	adcl -92(%esi), %eax
	adcl -88(%esi), %eax
	adcl -84(%esi), %eax
	adcl -80(%esi), %eax
	adcl -76(%esi), %eax
	adcl -72(%esi), %eax
	adcl -68(%esi), %eax
	adcl -64(%esi), %eax
	adcl -60(%esi), %eax
	adcl -56(%esi), %eax
	adcl -52(%esi), %eax
	adcl -48(%esi), %eax
	adcl -44(%esi), %eax
	adcl -40(%esi), %eax
	adcl -36(%esi), %eax
	adcl -32(%esi), %eax
	adcl -28(%esi), %eax
	adcl -24(%esi), %eax
	adcl -20(%esi), %eax
	adcl -16(%esi), %eax
	adcl -12(%esi), %eax
	adcl -8(%esi), %eax
	adcl -4(%esi), %eax
45:
	lea 128(%esi), %esi
	adcl $0, %eax
	dec %ecx
	jge 40b
	movl %edx, %ecx
50:	andl $3, %ecx
	jz 80f

	# Handle the last 1-3 bytes without jumping
	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
	movl $0xffffff,%ebx	# by the shll and shrl instructions
	shll $3,%ecx
	shrl %cl,%ebx
	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
	addl %ebx,%eax
	adcl $0,%eax
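	# In C terms, a sketch of the mask trick above (little-endian;
	# the remaining 1-3 bytes live at esi - 128 by this point):
	#
	#	mask = 0xffffff >> (8 * (3 - (len & 3)));
	#	sum += *(const u32 *)(esi - 128) & mask;
	#
	# i.e. keep only the low remaining bytes of the final dword.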
80:
	testb $1, 12(%esp)
	jz 90f
	roll $8, %eax
90:
	popl %ebx
	popl %esi
	RET
SYM_FUNC_END(csum_partial)

#endif
EXPORT_SYMBOL(csum_partial)

/*
unsigned int csum_partial_copy_generic(const char *src, char *dst, int len)
 */

/*
 * Copy from ds while checksumming, otherwise like csum_partial
 */
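
/*
 * A rough C model (an illustrative sketch, not the actual generic
 * implementation): the same one's-complement accumulation as
 * csum_partial, but starting from ~0U and copying as it goes.  If an
 * EXC()-protected access faults, the real routine returns 0 instead
 * (see the .fixup handlers below).
 *
 *	unsigned int csum_copy_model(const char *src, char *dst, int len)
 *	{
 *		unsigned long long acc = 0xffffffffull; // movl $-1, %eax
 *
 *		for (; len >= 4; src += 4, dst += 4, len -= 4) {
 *			unsigned int v = *(const unsigned int *)src;
 *			*(unsigned int *)dst = v;
 *			acc += v;
 *		}
 *		if (len >= 2) {
 *			unsigned short v = *(const unsigned short *)src;
 *			*(unsigned short *)dst = v;
 *			acc += v;
 *			src += 2; dst += 2; len -= 2;
 *		}
 *		if (len) {		// trailing byte (little-endian)
 *			*dst = *src;
 *			acc += (unsigned char)*src;
 *		}
 *		while (acc >> 32)	// fold the end-around carries
 *			acc = (acc & 0xffffffffull) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */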

#define EXC(y...)			\
	9999: y;			\
	_ASM_EXTABLE_UA(9999b, 6001f)
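
/*
 * Each EXC()-wrapped access gets an exception-table entry: if the
 * load or store faults, the fault handler resumes at local label
 * 6001, which zeroes the returned sum ("add zeroing on error" in the
 * changelog above) and jumps to the common exit.
 */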

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP		12

SYM_FUNC_START(csum_partial_copy_generic)
	subl  $4,%esp
	pushl %edi
	pushl %esi
	pushl %ebx
	movl ARGBASE+12(%esp),%ecx	# len
	movl ARGBASE+4(%esp),%esi	# src
	movl ARGBASE+8(%esp),%edi	# dst

	movl $-1, %eax			# sum
	testl $2, %edi			# Check alignment.
	jz 2f				# Jump if alignment is ok.
	subl $2, %ecx			# Alignment uses up two bytes.
	jae 1f				# Jump if we had at least two bytes.
	addl $2, %ecx			# ecx was < 2.  Deal with it.
	jmp 4f
EXC(1:	movw (%esi), %bx	)
	addl $2, %esi
EXC(	movw %bx, (%edi)	)
	addl $2, %edi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, FP(%esp)
	shrl $5, %ecx
	jz 2f
	testl %esi, %esi		# what's wrong with clc?
EXC(1:	movl (%esi), %ebx	)
EXC(	movl 4(%esi), %edx	)
	adcl %ebx, %eax
EXC(	movl %ebx, (%edi)	)
	adcl %edx, %eax
EXC(	movl %edx, 4(%edi)	)

EXC(	movl 8(%esi), %ebx	)
EXC(	movl 12(%esi), %edx	)
	adcl %ebx, %eax
EXC(	movl %ebx, 8(%edi)	)
	adcl %edx, %eax
EXC(	movl %edx, 12(%edi)	)

EXC(	movl 16(%esi), %ebx	)
EXC(	movl 20(%esi), %edx	)
	adcl %ebx, %eax
EXC(	movl %ebx, 16(%edi)	)
	adcl %edx, %eax
EXC(	movl %edx, 20(%edi)	)

EXC(	movl 24(%esi), %ebx	)
EXC(	movl 28(%esi), %edx	)
	adcl %ebx, %eax
EXC(	movl %ebx, 24(%edi)	)
	adcl %edx, %eax
EXC(	movl %edx, 28(%edi)	)

	lea 32(%esi), %esi
	lea 32(%edi), %edi
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl FP(%esp), %edx
	movl %edx, %ecx
	andl $0x1c, %edx
	je 4f
	shrl $2, %edx			# This clears CF
EXC(3:	movl (%esi), %ebx	)
	adcl %ebx, %eax
EXC(	movl %ebx, (%edi)	)
	lea 4(%esi), %esi
	lea 4(%edi), %edi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx
	jz 7f
	cmpl $2, %ecx
	jb 5f
EXC(	movw (%esi), %cx	)
	leal 2(%esi), %esi
EXC(	movw %cx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%ecx
EXC(5:	movb (%esi), %cl	)
EXC(	movb %cl, (%edi)	)
6:	addl %ecx, %eax
	adcl $0, %eax
7:

# Exception handler:
.section .fixup, "ax"

6001:
	xorl %eax, %eax
	jmp 7b

.previous

	popl %ebx
	popl %esi
	popl %edi
	popl %ecx			# equivalent to addl $4,%esp
	RET
SYM_FUNC_END(csum_partial_copy_generic)

#else

/* Version for PentiumII/PPro */

#define ROUND1(x) \
	EXC(movl x(%esi), %ebx	)	;	\
	addl %ebx, %eax			;	\
	EXC(movl %ebx, x(%edi)	)	;

#define ROUND(x) \
	EXC(movl x(%esi), %ebx	)	;	\
	adcl %ebx, %eax			;	\
	EXC(movl %ebx, x(%edi)	)	;

#define ARGBASE 12

SYM_FUNC_START(csum_partial_copy_generic)
	pushl %ebx
	pushl %edi
	pushl %esi
	movl ARGBASE+4(%esp),%esi	#src
	movl ARGBASE+8(%esp),%edi	#dst
	movl ARGBASE+12(%esp),%ecx	#len
	movl $-1, %eax			#sum
#	movl %ecx, %edx
	movl %ecx, %ebx
	movl %esi, %edx
	shrl $6, %ecx
	andl $0x3c, %ebx
	negl %ebx
	subl %ebx, %esi
	subl %ebx, %edi
	lea  -1(%esi),%edx
	andl $-32,%edx
	lea 3f(%ebx,%ebx), %ebx
	testl %esi, %esi
	JMP_NOSPEC ebx
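	# Computed entry into the unrolled block: each ROUND() expands
	# to 8 bytes of code (3-byte load + 2-byte adcl + 3-byte store)
	# covering 4 bytes of data, so 3f - 2*(len & 0x3c) lands on the
	# ROUND that copies and sums exactly the partial chunk.  The
	# testl above clears CF for the first adcl.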
1:	addl $64,%esi
	addl $64,%edi
	EXC(movb -32(%edx),%bl)	; EXC(movb (%edx),%bl)	# touch next src lines early
	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
3:	adcl $0,%eax
	addl $64, %edx
	dec %ecx
	jge 1b
4:	movl ARGBASE+12(%esp),%edx	#len
	andl $3, %edx
	jz 7f
	cmpl $2, %edx
	jb 5f
EXC(	movw (%esi), %dx	)
	leal 2(%esi), %esi
EXC(	movw %dx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%edx
5:
EXC(	movb (%esi), %dl	)
EXC(	movb %dl, (%edi)	)
6:	addl %edx, %eax
	adcl $0, %eax
7:
.section .fixup, "ax"
6001:	xorl %eax, %eax
	jmp  7b
.previous

	popl %esi
	popl %edi
	popl %ebx
	RET
SYM_FUNC_END(csum_partial_copy_generic)

#undef ROUND
#undef ROUND1

#endif
EXPORT_SYMBOL(csum_partial_copy_generic)
458