/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Optimized version of the ip_fast_csum() function
 * Used for calculating IP header checksum
 *
 * Return: 16bit checksum, complemented
 *
 * Inputs:
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the buffer (int), counted in 32-bit words
 *	     (the fast path below tests for 5, i.e. 20 bytes)
 *
 * Copyright (C) 2002, 2006 Intel Corp.
 * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
 */

#include <asm/asmmacro.h>
#include <asm/export.h>

/*
 * Since we know that most likely this function is called with buf aligned
 * on 4-byte boundary and 20 bytes in length, we can execute rather quickly
 * versus calling generic version of do_csum, which has lots of overhead in
 * handling various alignments and sizes.  However, due to lack of constraints
 * put on the function input argument, cases with alignment not on 4-byte or
 * size not equal to 20 bytes will be handled by the generic do_csum function.
 */

/*
 * Explicit names for the stacked input registers and the return register.
 * NOTE(review): presumably needed because the leaf paths in this file run
 * without an alloc, so the assembler has no register-stack frame to derive
 * in0..in4 from -- confirm against the assembler documentation.
 */
#define in0	r32
#define in1	r33
#define in2	r34
#define in3	r35
#define in4	r36
#define ret0	r8

/*
 * ip_fast_csum
 *
 * Register usage on the fast path:
 *	p6      - set when the generic path must be taken
 *	p7      - set when the fast path may be taken (complement of p6)
 *	r14     - low two bits of the buffer address (alignment test)
 *	r15     - second load pointer, starts at in0 + 4
 *	r20-r24 - the five 32-bit words of the buffer; r20 doubles as the
 *	          running sum
 *	r9      - 0xffff mask used for the final complement
 *	ret0    - scratch for the carry during folding, then the result
 */
GLOBAL_ENTRY(ip_fast_csum)
	.prologue
	.body
	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
	and	r14=3,in0	// is it aligned on 4-byte?
	add	r15=4,in0	// second source pointer
	;;
	// If the address is misaligned, force p6=1 and p7=0; otherwise the
	// result of the length test above is left untouched.
	cmp.ne.or.andcm p6,p7=r14,r0
	;;
	// The two pointers interleave: in0 reads words 0, 2, 4 and r15 reads
	// words 1, 3, each post-incremented by 8 bytes.
(p7)	ld4	r20=[in0],8
(p7)	ld4	r21=[r15],8
(p6)	br.spnt	.generic
	;;
	ld4	r22=[in0],8
	ld4	r23=[r15],8
	;;
	ld4	r24=[in0]
	add	r20=r20,r21
	add	r22=r22,r23
	;;
	add	r20=r20,r22
	;;
	add	r20=r20,r24	// r20 = 64-bit sum of the five words
	;;
	// Fold the sum down to 16 bits: add everything above bit 15 back
	// into the low 16 bits, three times over.
	shr.u	ret0=r20,16	// now need to add the carry
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16	// add carry again
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	mov	r9=0xffff
	;;
	andcm	ret0=r9,r20	// ret0 = 0xffff & ~r20 (16-bit complement)
	.restore sp		// reset frame state
	br.ret.sptk.many b0
	;;

	/*
	 * Slow path: hand the buffer to do_csum and complement its result.
	 * The alloc sets up 2 inputs (r32-r33), 2 locals (r34-r35) and
	 * 2 outputs (out0-out1); r35 keeps ar.pfs and r34 keeps the return
	 * address across the call.
	 */
.generic:
	.prologue
	.save ar.pfs, r35
	alloc	r35=ar.pfs,2,2,2,0
	.save rp, r34
	mov	r34=b0
	.body
	dep.z	out1=in1,2,30	// out1 = in1 << 2: word count -> byte count
	mov	out0=in0
	;;
	br.call.sptk.many b0=do_csum
	;;
	andcm	ret0=-1,ret0	// ret0 = ~ret0 across all 64 bits
	mov	ar.pfs=r35
	mov	b0=r34
	br.ret.sptk.many b0
END(ip_fast_csum)
EXPORT_SYMBOL(ip_fast_csum)

/*
 * csum_ipv6_magic
 *
 * Return (r8): ones-complement sum of the inputs folded to 16 bits,
 * complemented and masked with 0xffff.
 *
 * Inputs, as consumed by the code below:
 *	in0: pointer to 16 bytes, read as four 32-bit words
 *	in1: pointer to 16 bytes, read as four 32-bit words
 *	in2: 32-bit value (bits above 31 are cleared before use)
 *	in3: only the low 16 bits are used
 *	in4: 32-bit value (bits above 31 are cleared), added in as is
 *
 * NOTE(review): these presumably map to the saddr, daddr, len, proto and
 * csum arguments of the C prototype -- confirm against the declaration
 * in the checksum header.
 *
 * Leaf routine: no alloc, no memory stores; scratch registers written are
 * r8-r11 and r15-r27, plus in0, in1 (post-incremented), in2 and in4
 * (zero-extended in place).
 */
GLOBAL_ENTRY(csum_ipv6_magic)
	ld4	r20=[in0],4
	ld4	r21=[in1],4
	zxt4	in2=in2
	;;
	ld4	r22=[in0],4
	ld4	r23=[in1],4
	dep	r15=in3,in2,32,16	// r15 = in3[15:0] << 32 | in2[31:0]
	;;
	ld4	r24=[in0],4
	ld4	r25=[in1],4
	mux1	r15=r15,@rev		// reverse all eight bytes of r15
	add	r16=r20,r21
	add	r17=r22,r23
	zxt4	in4=in4
	;;
	ld4	r26=[in0],4
	ld4	r27=[in1],4
	// Drop the two zero bytes the reversal moved to the bottom; r15 now
	// holds the 6-byte quantity (in3[15:0] : in2[31:0]) byte-reversed.
	shr.u	r15=r15,16
	add	r18=r24,r25
	add	r8=r16,r17
	;;
	add	r19=r26,r27
	add	r8=r8,r18
	;;
	add	r8=r8,r19		// r8 = sum of all eight loaded words
	add	r15=r15,in4
	;;
	add	r8=r8,r15
	;;
	shr.u	r10=r8,32	// now fold sum into short
	zxt4	r11=r8
	;;
	add	r8=r10,r11
	;;
	shr.u	r10=r8,16	// yeah, keep it rolling
	zxt2	r11=r8
	;;
	add	r8=r10,r11
	;;
	shr.u	r10=r8,16	// three times lucky
	zxt2	r11=r8
	;;
	add	r8=r10,r11
	mov	r9=0xffff
	;;
	andcm	r8=r9,r8		// r8 = 0xffff & ~r8 (16-bit complement)
	br.ret.sptk.many b0
END(csum_ipv6_magic)
EXPORT_SYMBOL(csum_ipv6_magic)