/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
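/*
 * For reference, a rough C model of the arithmetic performed here (an
 * illustrative sketch only -- the asm below additionally handles
 * alignment, unrolling and the carry chain via the CA bit):
 *
 *	static u32 csum_partial_model(const u8 *buff, size_t len, u32 sum)
 *	{
 *		u64 s = sum;
 *		u64 v;
 *
 *		while (len >= 8) {
 *			memcpy(&v, buff, 8);	// native-endian 64-bit load
 *			s += v;
 *			if (s < v)		// end-around carry, like adde
 *				s++;
 *			buff += 8;
 *			len -= 8;
 *		}
 *		// (4/2/1-byte tails handled the same way, elided)
 *		s = (s >> 32) + (s & 0xffffffffULL);	// fold 64 -> 32
 *		return (u32)(s + (s >> 32));
 *	}
 */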
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
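	/*
	 * For example, buff = 0x1002 is halfword aligned: r6 below is
	 * (0x1002 >> 1) & 0x3 = 1, so 4 - 1 = 3 halfwords are summed by
	 * the loop, advancing buff to the doubleword boundary at 0x1008.
	 */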
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6
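
	/*
	 * For example, len = 256 gives 256 >> 7 = 2 above (so we take the
	 * unrolled path) and ctr = (256 >> 6) - 1 = 3: three 64-byte loop
	 * iterations plus the 64-byte exit limb consume all 256 bytes,
	 * and the "andi. r4,r4,63" afterwards leaves no tail.
	 */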

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration (eight adde instructions per
	 * iteration at 2 cycles each). The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

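	/*
	 * For example, with 7 tail bytes each branch below is taken in
	 * turn: one lwz (4 bytes), one lhz (2 bytes), one lbz (1 byte).
	 */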
.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

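	/*
	 * The rotate-and-add below is the usual end-around-carry fold:
	 * with r0 = hi:lo, rotating by 32 gives lo:hi, and the top 32
	 * bits of (lo:hi + hi:lo) are lo + hi plus the carry out of the
	 * low half. In C, roughly:
	 *
	 *	u64 t = (r0 >> 32) + (r0 & 0xffffffffULL);
	 *	return (u32)(t + (t >> 32));
	 */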
.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


	.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lerror)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Lerror)
	.endm
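
/*
 * Each of the macros above plants a local label in front of the
 * following instruction and adds an exception-table entry for it, so
 * that, e.g.:
 *
 *	srcnr;	lhz	r6,0(r3)
 *
 * makes a fault on that lhz branch to .Lerror_nr. The source/dest
 * variants branch to .Lerror instead, which pops the stack frame and
 * restores r14-r16 first; they are used for accesses made while that
 * frame is live (the unrolled loop). The "nr" (no-restore) variants
 * are used everywhere else.
 */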

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
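/*
 * Roughly, in C (an illustrative sketch; fault handling is really done
 * via the exception-table macros above rather than an explicit check --
 * faultcheck() below is purely hypothetical. Seeding the sum with
 * 0xffffffff means a successful copy can never fold to 0, so 0 is
 * unambiguous as the error return):
 *
 *	static u32 csum_copy_model(const u8 *src, u8 *dst, size_t len)
 *	{
 *		u64 s = 0xffffffff;
 *		u64 v;
 *
 *		while (len >= 8) {
 *			if (faultcheck(src) || faultcheck(dst))
 *				return 0;
 *			memcpy(&v, src, 8);
 *			memcpy(dst, src, 8);
 *			s += v;
 *			if (s < v)		// end-around carry, like adde
 *				s++;
 *			src += 8; dst += 8; len -= 8;
 *		}
 *		// (4/2/1-byte tails handled the same way, elided)
 *		s = (s >> 32) + (s & 0xffffffffULL);	// fold 64 -> 32
 *		return (u32)(s + (s >> 32));
 *	}
 */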
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are misaligned relative to each
	 * other, we align only the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

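	/*
	 * Faults from "source"/"dest" accesses inside the unrolled loop
	 * land here so the stack frame pushed above can be unwound;
	 * "srcnr"/"dstnr" faults go straight to .Lerror_nr.
	 */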
.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */

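/*
 * An illustrative C sketch, where load64() and ones_add64() are
 * hypothetical helpers for a raw 8-byte load and a 64-bit add with
 * end-around carry (the little-endian byte-lane adjustment of
 * len + proto is elided here):
 *
 *	u64 a, t;
 *	a = ones_add64(load64(saddr), load64((const u8 *)saddr + 8));
 *	a = ones_add64(a, load64(daddr));
 *	a = ones_add64(a, load64((const u8 *)daddr + 8));
 *	a = ones_add64(a, (u64)len + proto + (u32)sum);
 *	t = (a >> 32) + (a & 0xffffffffULL);	// fold 64 -> 32
 *	t += t >> 32;
 *	t = (t >> 16) + (t & 0xffffULL);	// fold 32 -> 16
 *	t += t >> 16;
 *	return (__sum16)(~t & 0xffff);
 */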
_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)		/* saddr: first 8 bytes */
	ld	r9, 8(r3)		/* saddr: last 8 bytes */
	add	r5, r5, r6		/* len + proto */
	addc	r0, r8, r9
	ld	r10, 0(r4)		/* daddr: first 8 bytes */
	ld	r11, 8(r4)		/* daddr: last 8 bytes */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8		/* line up with the LE byte lanes */
#endif
	adde	r0, r0, r10
	add	r5, r5, r7		/* len + proto + sum */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0			/* add in final carry */
	rotldi  r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3			/* ones' complement */
	rlwinm	r3, r3, 16, 16, 31	/* extract the folded 16 bits */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)