xref: /OK3568_Linux_fs/kernel/arch/x86/um/checksum_32.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * INET		An implementation of the TCP/IP protocol suite for the LINUX
4*4882a593Smuzhiyun *		operating system.  INET is implemented using the  BSD Socket
5*4882a593Smuzhiyun *		interface as the means of communication with the user level.
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun *		IP/TCP/UDP checksumming routines
8*4882a593Smuzhiyun *
9*4882a593Smuzhiyun * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
10*4882a593Smuzhiyun *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
11*4882a593Smuzhiyun *		Tom May, <ftom@netcom.com>
12*4882a593Smuzhiyun *              Pentium Pro/II routines:
13*4882a593Smuzhiyun *              Alexander Kjeldaas <astor@guardian.no>
14*4882a593Smuzhiyun *              Finn Arne Gangstad <finnag@guardian.no>
15*4882a593Smuzhiyun *		Lots of code moved from tcp.c and ip.c; see those files
16*4882a593Smuzhiyun *		for more names.
17*4882a593Smuzhiyun *
18*4882a593Smuzhiyun * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
19*4882a593Smuzhiyun *			     handling.
20*4882a593Smuzhiyun *		Andi Kleen,  add zeroing on error
21*4882a593Smuzhiyun *                   converted to pure assembler
22*4882a593Smuzhiyun */
23*4882a593Smuzhiyun
24*4882a593Smuzhiyun#include <asm/errno.h>
25*4882a593Smuzhiyun#include <asm/asm.h>
26*4882a593Smuzhiyun#include <asm/export.h>
27*4882a593Smuzhiyun
28*4882a593Smuzhiyun/*
29*4882a593Smuzhiyun * computes a partial checksum, e.g. for TCP/UDP fragments
30*4882a593Smuzhiyun */
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun/*
33*4882a593Smuzhiyununsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
34*4882a593Smuzhiyun */
35*4882a593Smuzhiyun
36*4882a593Smuzhiyun.text
37*4882a593Smuzhiyun.align 4
38*4882a593Smuzhiyun.globl csum_partial
39*4882a593Smuzhiyun
40*4882a593Smuzhiyun#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
41*4882a593Smuzhiyun
/*
 * unsigned int csum_partial(const unsigned char *buff, int len,
 *                           unsigned int sum)
 *
 * Generic i386 variant, assembled when CONFIG_X86_USE_PPRO_CHECKSUM is
 * not defined.  Adds the bytes buff[0 .. len) to the 32-bit partial
 * checksum "sum" with end-around carry and returns the new partial sum.
 *
 * In:    arguments on the stack; after the two pushes below they sit at
 *        12(%esp) = buff, 16(%esp) = len, 20(%esp) = sum
 * Out:   %eax = updated partial checksum
 * Regs:  %ecx and %edx are used as scratch; %esi and %ebx are saved on
 *        entry and restored before returning
 *
 * Experiments with Ethernet and SLIP connections show that buff is
 * aligned on either a 2-byte or a 4-byte boundary.  The 4-byte case is at
 * least twice as fast on 486 and Pentium, so a 2-byte aligned start is
 * first turned into a 4-byte aligned one by summing a single 16-bit word
 * ahead of the unrolled loop.
 */
csum_partial:
	pushl	%esi
	pushl	%ebx
	movl	20(%esp), %eax		# eax = sum, the running checksum
	movl	16(%esp), %ecx		# ecx = len
	movl	12(%esp), %esi		# esi = buff
	testl	$2, %esi		# address bit 1 set?
	jz	.Lcsum_aligned		# no: alignment is already fine
	subl	$2, %ecx		# the alignment word uses up two bytes
	jae	.Lcsum_align_word	# len was at least 2: consume that word
	addl	$2, %ecx		# len was below 2: undo the subtraction
	jmp	.Lcsum_tail		# and treat everything as trailing bytes

.Lcsum_align_word:
	movw	(%esi), %bx
	addl	$2, %esi
	addw	%bx, %ax
	adcl	$0, %eax		# fold the 16-bit carry back in

.Lcsum_aligned:
	movl	%ecx, %edx		# edx = remaining byte count (kept for later)
	shrl	$5, %ecx		# ecx = number of whole 32-byte chunks
	jz	.Lcsum_dwords
	testl	%esi, %esi		# test clears CF for the adcl chain

	/* Eight dwords per pass; CF is carried from one adcl to the next. */
.Lcsum_chunk32:
	movl	(%esi), %ebx
	adcl	%ebx, %eax
	movl	4(%esi), %ebx
	adcl	%ebx, %eax
	movl	8(%esi), %ebx
	adcl	%ebx, %eax
	movl	12(%esi), %ebx
	adcl	%ebx, %eax
	movl	16(%esi), %ebx
	adcl	%ebx, %eax
	movl	20(%esi), %ebx
	adcl	%ebx, %eax
	movl	24(%esi), %ebx
	adcl	%ebx, %eax
	movl	28(%esi), %ebx
	adcl	%ebx, %eax
	leal	32(%esi), %esi		# lea leaves the flags alone
	decl	%ecx			# dec leaves CF alone
	jnz	.Lcsum_chunk32
	adcl	$0, %eax		# absorb the carry of the last adcl

.Lcsum_dwords:
	movl	%edx, %ecx		# ecx = remaining byte count again
	andl	$0x1c, %edx		# edx = bytes in the 0..7 leftover dwords
	jz	.Lcsum_tail
	shrl	$2, %edx		# edx = dword count; this clears CF
.Lcsum_dword:
	adcl	(%esi), %eax
	leal	4(%esi), %esi
	decl	%edx
	jnz	.Lcsum_dword
	adcl	$0, %eax

.Lcsum_tail:
	andl	$3, %ecx		# ecx = 0..3 trailing bytes
	jz	.Lcsum_done
	cmpl	$2, %ecx
	jb	.Lcsum_last_byte	# exactly one byte left
	movw	(%esi), %cx		# two or three left: fetch a 16-bit word
	leal	2(%esi), %esi		# flags of the cmpl above stay intact
	jz	.Lcsum_add_tail		# exactly two: add the word as is
	shll	$16, %ecx		# three: move the word up, byte goes below
.Lcsum_last_byte:
	movb	(%esi), %cl
.Lcsum_add_tail:
	addl	%ecx, %eax
	adcl	$0, %eax
.Lcsum_done:
	popl	%ebx
	popl	%esi
	RET
114*4882a593Smuzhiyun
115*4882a593Smuzhiyun#else
116*4882a593Smuzhiyun
/*
 * Version for PentiumII/PPro
 *
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum)
 *
 * Assembled when CONFIG_X86_USE_PPRO_CHECKSUM is defined.  Adds the bytes
 * buf[0 .. len) to the 32-bit partial checksum "sum" with end-around
 * carry and returns the new partial sum.
 *
 * In:    arguments on the stack; after the two pushes below they sit at
 *        12(%esp) = buf, 16(%esp) = len, 20(%esp) = sum
 * Out:   %eax = updated partial checksum
 * Regs:  %ecx and %edx are used as scratch; %esi and %ebx are saved on
 *        entry and restored before returning
 *
 * Layout: a start address with bit 1 set is first advanced by one 16-bit
 * word.  The dwords are then summed by an unrolled block of 32 additions
 * that is entered part-way through for the leftover (len % 128) / 4
 * dwords and then run in full once per 128-byte block.  The final 1-3
 * bytes are taken from one masked dword load.
 *
 * NOTE(review): the masked tail load reads a whole dword and relies on
 * the pointer being 4-byte aligned at that point; a start address with
 * bit 0 set is not realigned anywhere in this routine - confirm callers
 * never pass one, or that the over-read is acceptable.
 */
csum_partial:
	pushl	%esi
	pushl	%ebx
	movl	20(%esp), %eax		# Function arg: unsigned int sum
	movl	16(%esp), %ecx		# Function arg: int len
	movl	12(%esp), %esi		# Function arg: const unsigned char *buf

	testl	$2, %esi
	jnz	.Lppro_align2
.Lppro_aligned:
	movl	%ecx, %edx		# edx = byte count, kept for the 1-3 byte tail
	movl	%ecx, %ebx
	andl	$0x7c, %ebx		# ebx = bytes in the partial block (0..124)
	shrl	$7, %ecx		# ecx = number of full 128-byte blocks
	addl	%ebx, %esi		# esi = end of the partial block
	shrl	$2, %ebx		# ebx = dwords in the partial block
	negl	%ebx
	/*
	 * Entry point = block end - 3 * dwords.  This relies on every
	 * addition in the unrolled block assembling to exactly 3 bytes
	 * (opcode, ModRM, 8-bit displacement), so do not change the form
	 * of those instructions.
	 */
	leal	.Lppro_block_end(%ebx,%ebx,2), %ebx
	testl	%esi, %esi		# test clears CF for the adcl chain
	jmp	*%ebx

	/* Handle 2-byte-aligned regions: sum one word, then use the dword path */
.Lppro_eat_word:
	addw	(%esi), %ax
	leal	2(%esi), %esi		# lea leaves the carry of addw intact
	adcl	$0, %eax
	jmp	.Lppro_aligned

.Lppro_align2:
	subl	$2, %ecx
	ja	.Lppro_eat_word		# len > 2
	je	.Lppro_two_bytes	# len == 2
	addl	$2, %ecx		# len < 2: restore the byte count
	jz	.Lppro_done		# len == 0: nothing to sum, do not touch buf
	movzbl	(%esi), %ebx		# csumming 1 byte, 2-aligned
	addl	%ebx, %eax
	adcl	$0, %eax
	jmp	.Lppro_done
.Lppro_two_bytes:
	addw	(%esi), %ax		# csumming 2 bytes, 2-aligned
	adcl	$0, %eax
	jmp	.Lppro_done

	/* One full 128-byte block: the dwords at esi-128 .. esi-4. */
.Lppro_block:
	addl	-128(%esi), %eax
	adcl	-124(%esi), %eax
	adcl	-120(%esi), %eax
	adcl	-116(%esi), %eax
	adcl	-112(%esi), %eax
	adcl	-108(%esi), %eax
	adcl	-104(%esi), %eax
	adcl	-100(%esi), %eax
	adcl	-96(%esi), %eax
	adcl	-92(%esi), %eax
	adcl	-88(%esi), %eax
	adcl	-84(%esi), %eax
	adcl	-80(%esi), %eax
	adcl	-76(%esi), %eax
	adcl	-72(%esi), %eax
	adcl	-68(%esi), %eax
	adcl	-64(%esi), %eax
	adcl	-60(%esi), %eax
	adcl	-56(%esi), %eax
	adcl	-52(%esi), %eax
	adcl	-48(%esi), %eax
	adcl	-44(%esi), %eax
	adcl	-40(%esi), %eax
	adcl	-36(%esi), %eax
	adcl	-32(%esi), %eax
	adcl	-28(%esi), %eax
	adcl	-24(%esi), %eax
	adcl	-20(%esi), %eax
	adcl	-16(%esi), %eax
	adcl	-12(%esi), %eax
	adcl	-8(%esi), %eax
	adcl	-4(%esi), %eax
.Lppro_block_end:
	leal	128(%esi), %esi		# esi = end of the next block
	adcl	$0, %eax		# absorb the carry of the last adcl
	decl	%ecx
	jge	.Lppro_block		# full blocks still to do
	movl	%edx, %ecx
	andl	$3, %ecx
	jz	.Lppro_done

	/* Handle the last 1-3 bytes without jumping */
	notl	%ecx			# 1->2, 2->1, 3->0, higher bits are masked
	movl	$0xffffff, %ebx		# by the shll and shrl instructions
	shll	$3, %ecx
	shrl	%cl, %ebx		# ebx = mask covering the 1-3 valid bytes
	andl	-128(%esi), %ebx	# esi is 4-aligned so should be ok
	addl	%ebx, %eax
	adcl	$0, %eax
.Lppro_done:
	popl	%ebx
	popl	%esi
	RET
212*4882a593Smuzhiyun
213*4882a593Smuzhiyun#endif
214*4882a593Smuzhiyun	EXPORT_SYMBOL(csum_partial)
215