/* SPDX-License-Identifier: GPL-2.0 */
/* csum_copy.S: Checksum+copy code for sparc64
 *
 * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
 */

#include <asm/export.h>

/* Extra scratch register.  It holds the "source started on an odd
 * address" flag in the mutually-aligned path and the running sum in
 * the mismatched-alignment path (label 95).
 */
#ifdef __KERNEL__
#define GLOBAL_SPARE	%g7
#else
#define GLOBAL_SPARE	%g5
#endif

/* Every hook below is only defined when not already defined, so a
 * file that includes this one may supply its own wrappers for the
 * loads, the stores and the routine name.  The defaults are plain
 * pass-throughs.
 */
#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	csum_partial_copy_nocheck
#endif

	.register	%g2, #scratch
	.register	%g3, #scratch

	.text

/*
 * FUNC_NAME: copy len bytes from src to dst while summing them.
 *
 * In:	%o0 = src, %o1 = dst, %o2 = len
 * Out:	%o0 = 32-bit result: %o3 is seeded with -1, the sum of the
 *	copied data folded down to 16 bits is added to it with the
 *	carry wrapped back in, and the result is zero-extended.
 *	A zero length therefore returns 0xffffffff.
 * Writes: %o0-%o5, %g1, %g2, %g3, GLOBAL_SPARE, condition codes.
 *
 * Layout notes:
 *  - An instruction indented by one extra space sits in the delay
 *    slot of the branch directly above it.
 *  - Label 90 is placed *before* the entry point; it is only reached
 *    by the backward branch "bne 90b" in the entry sequence.
 *  - Label 95 handles src/dst whose low two address bits differ.
 */

90:
	/* We checked for zero length already, so there must be
	 * at least one byte.
	 */
	/* icc here comes from "andcc %o0, 0x1, GLOBAL_SPARE" in the
	 * delay slot of the branch that brought us here: Z set means
	 * src is even, so no leading byte has to be peeled off.
	 */
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldub, %o0 + 0x00, %o4))	/* first byte starts the sum */
	add		%o0, 1, %o0
	sub		%o2, 1, %o2
	EX_ST(STORE(stb, %o4, %o1 + 0x00))
	add		%o1, 1, %o1
1:	andcc		%o0, 0x2, %g0		/* src 4-byte aligned yet? */
	be,pn		%icc, 80f
	 cmp		%o2, 2
	blu,pn		%icc, 60f		/* < 2 bytes left: tail code */
	 nop
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	add		%o0, 2, %o0
	sub		%o2, 2, %o2
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 2, %o1
	ba,pt		%xcc, 80f
	 add		%o5, %o4, %o4		/* sum += halfword */

	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
	EXPORT_SYMBOL(FUNC_NAME)
FUNC_NAME:		/* %o0=src, %o1=dst, %o2=len */
	LOAD(prefetch, %o0 + 0x000, #n_reads)
	xor		%o0, %o1, %g1
	mov		-1, %o3			/* value the data sum is added to */
	clr		%o4			/* running sum of the data */
	andcc		%g1, 0x3, %g0
	bne,pn		%icc, 95f		/* low 2 address bits differ */
	 LOAD(prefetch, %o0 + 0x040, #n_reads)

	brz,pn		%o2, 70f		/* len == 0: return right away */
	 andcc		%o0, 0x3, %g0

	/* We "remember" whether the lowest bit in the address
	 * was set in GLOBAL_SPARE.  Because if it is, we have to swap
	 * upper and lower 8 bit fields of the sum we calculate.
	 */
	bne,pn		%icc, 90b
	 andcc		%o0, 0x1, GLOBAL_SPARE

80:
	/* %g3 = bytes covered by whole 64-byte chunks,
	 * %o2 = what is left over after them (0..63).
	 */
	LOAD(prefetch, %o0 + 0x080, #n_reads)
	andncc		%o2, 0x3f, %g3

	LOAD(prefetch, %o0 + 0x0c0, #n_reads)
	sub		%o2, %g3, %o2
	brz,pn		%g3, 2f
	 LOAD(prefetch, %o0 + 0x100, #n_reads)

	/* So that we don't need to use the non-pairing
	 * add-with-carry instructions we accumulate 32-bit
	 * values into a 64-bit register.  At the end of the
	 * loop we fold it down to 32-bits and so on.
	 */
	ba,pt		%xcc, 1f
	 LOAD(prefetch, %o0 + 0x140, #n_reads)

	/* 64 bytes per iteration: sixteen word loads rotated through
	 * %o5/%g1/%g2, each one added to %o4 and stored to dst.
	 */
	.align		32
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	EX_LD(LOAD(lduw, %o0 + 0x04, %g1))
	EX_LD(LOAD(lduw, %o0 + 0x08, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	EX_LD(LOAD(lduw, %o0 + 0x0c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x04))
	EX_LD(LOAD(lduw, %o0 + 0x10, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x08))
	EX_LD(LOAD(lduw, %o0 + 0x14, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x0c))
	EX_LD(LOAD(lduw, %o0 + 0x18, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x10))
	EX_LD(LOAD(lduw, %o0 + 0x1c, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x14))
	EX_LD(LOAD(lduw, %o0 + 0x20, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x18))
	EX_LD(LOAD(lduw, %o0 + 0x24, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x1c))
	EX_LD(LOAD(lduw, %o0 + 0x28, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x20))
	EX_LD(LOAD(lduw, %o0 + 0x2c, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x24))
	EX_LD(LOAD(lduw, %o0 + 0x30, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x28))
	EX_LD(LOAD(lduw, %o0 + 0x34, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x2c))
	EX_LD(LOAD(lduw, %o0 + 0x38, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x30))
	EX_LD(LOAD(lduw, %o0 + 0x3c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x34))
	LOAD(prefetch, %o0 + 0x180, #n_reads)
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x38))
	subcc		%g3, 0x40, %g3
	add		%o0, 0x40, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x3c))
	bne,pt		%icc, 1b
	 add		%o1, 0x40, %o1

	/* Remaining whole words: %g3 = %o2 & 0x3c bytes, one word
	 * per iteration; %o2 keeps the final 0..3 bytes.
	 */
2:	and		%o2, 0x3c, %g3
	brz,pn		%g3, 2f
	 sub		%o2, %g3, %o2
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	subcc		%g3, 0x4, %g3
	add		%o0, 0x4, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	bne,pt		%icc, 1b
	 add		%o1, 0x4, %o1

2:
	/* fold 64-->32 */
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4
	srlx		%o4, 32, %o5		/* second pass absorbs the carry */
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4

	/* fold 32-->16 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

60:
	/* %o4 has the 16-bit sum we have calculated so-far. */
	cmp		%o2, 2
	blu,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))	/* trailing halfword */
	sub		%o2, 2, %o2
	add		%o0, 2, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 0x2, %o1
1:	brz,pt		%o2, 1f
	 nop
	EX_LD(LOAD(ldub, %o0 + 0x00, %o5))	/* trailing byte */
	sub		%o2, 1, %o2
	add		%o0, 1, %o0
	EX_ST(STORE(stb, %o5, %o1 + 0x00))
	sllx		%o5, 8, %o5		/* counted as the upper 8 bits */
	add		%o1, 1, %o1
	add		%o4, %o5, %o4
1:
	/* fold 32-->16 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

1:	brz,pt		GLOBAL_SPARE, 1f
	 nop

	/* We started with an odd byte, byte-swap the result. */
	srl		%o4, 8, %o5
	and		%o4, 0xff, %g1
	sll		%g1, 8, %g1
	or		%o5, %g1, %o4

	/* Add the data sum to %o3, wrapping the 32-bit carry back in. */
1:	addcc		%o3, %o4, %o3
	addc		%g0, %o3, %o3

70:
	retl
	 srl		%o3, 0, %o0		/* zero-extend the 32-bit result */

	/* Low two bits of src and dst differ.  Loads use the widest
	 * size src alignment allows; every store is done bytewise.
	 * GLOBAL_SPARE is the running sum, %o5 remembers whether src
	 * started on an odd address.  Branches carrying ",a" execute
	 * their delay slot only when taken.
	 */
95:	mov		0, GLOBAL_SPARE
	brlez,pn	%o2, 4f			/* len <= 0: nothing to copy */
	 andcc		%o0, 1, %o5
	be,a,pt		%icc, 1f
	 srl		%o2, 1, %g1		/* %g1 = halfwords remaining */
	sub		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE))	/* leading odd byte */
	add		%o0, 1, %o0
	EX_ST(STORE(stb, GLOBAL_SPARE, %o1))
	srl		%o2, 1, %g1		/* %g1 = halfwords remaining */
	add		%o1, 1, %o1
1:	brz,a,pn	%g1, 3f			/* no halfword left: byte check */
	 andcc		%o2, 1, %g0
	andcc		%o0, 2, %g0
	be,a,pt		%icc, 1f
	 srl		%g1, 1, %g1		/* %g1 = words remaining */
	EX_LD(LOAD(lduh, %o0, %o4))		/* halfword to word-align src */
	sub		%o2, 2, %o2
	srl		%o4, 8, %g2
	sub		%g1, 1, %g1
	EX_ST(STORE(stb, %g2, %o1))
	add		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o0, 2, %o0
	srl		%g1, 1, %g1		/* %g1 = words remaining */
	add		%o1, 2, %o1
1:	brz,a,pn	%g1, 2f			/* no whole word left */
	 andcc		%o2, 2, %g0
	EX_LD(LOAD(lduw, %o0, %o4))
	/* Word loop: the next word is loaded in the annulled delay
	 * slot of the loop branch, so label 5 skips the first load.
	 */
5:	srl		%o4, 24, %g2
	srl		%o4, 16, %g3
	EX_ST(STORE(stb, %g2, %o1))
	srl		%o4, 8, %g2
	EX_ST(STORE(stb, %g3, %o1 + 1))
	add		%o0, 4, %o0
	EX_ST(STORE(stb, %g2, %o1 + 2))
	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 3))
	addc		GLOBAL_SPARE, %g0, GLOBAL_SPARE	/* wrap the carry */
	add		%o1, 4, %o1
	subcc		%g1, 1, %g1
	bne,a,pt	%icc, 5b
	 EX_LD(LOAD(lduw, %o0, %o4))
	/* Add the upper and lower 16-bit halves of the sum. */
	sll		GLOBAL_SPARE, 16, %g2
	srl		GLOBAL_SPARE, 16, GLOBAL_SPARE
	srl		%g2, 16, %g2
	andcc		%o2, 2, %g0
	add		%g2, GLOBAL_SPARE, GLOBAL_SPARE
2:	be,a,pt		%icc, 3f		/* no trailing halfword */
	 andcc		%o2, 1, %g0
	EX_LD(LOAD(lduh, %o0, %o4))
	andcc		%o2, 1, %g0
	srl		%o4, 8, %g2
	add		%o0, 2, %o0
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o1, 2, %o1
3:	be,a,pt		%icc, 1f		/* no trailing byte */
	 sll		GLOBAL_SPARE, 16, %o4
	EX_LD(LOAD(ldub, %o0, %g2))
	sll		%g2, 8, %o4		/* counted as the upper 8 bits */
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 16, %o4
	/* Fold to 16 bits: (sum << 16) + sum puts low + high in the
	 * upper half, and the carry out is added back by addc.
	 */
1:	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	srl		GLOBAL_SPARE, 16, %o4
	addc		%g0, %o4, GLOBAL_SPARE
	brz,pt		%o5, 4f			/* even start: no swap needed */
	 srl		GLOBAL_SPARE, 8, %o4
	/* src started on an odd address: swap the two sum bytes. */
	and		GLOBAL_SPARE, 0xff, %g2
	and		%o4, 0xff, %o4
	sll		%g2, 8, %g2
	or		%g2, %o4, GLOBAL_SPARE
4:	addcc		%o3, GLOBAL_SPARE, %o3
	addc		%g0, %o3, %o0
	retl
	 srl		%o0, 0, %o0		/* zero-extend the 32-bit result */
	.size		FUNC_NAME, .-FUNC_NAME