/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-csum_ipv6_magic.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * unsigned short csum_ipv6_magic(struct in6_addr *saddr,
 *                                struct in6_addr *daddr,
 *                                __u32 len,
 *                                unsigned short proto,
 *                                unsigned int csum);
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
 *
 * unsigned short csum_ipv6_magic(struct in6_addr *saddr,
 *                                struct in6_addr *daddr,
 *                                __u32 len,
 *                                unsigned short proto,
 *                                unsigned int csum);
 *
 * Swap <proto> (takes form 0xaabb)
 * Then shift it left by 48, so result is:
 *	0xbbaa0000 00000000
 * Then turn it back into a sign extended 32-bit item
 *	0xbbaa0000
 *
 * Swap <len> (an unsigned int) using Mike Burrows' 7-instruction sequence
 *	(we can't hide the 3-cycle latency of the unpkbw in the 6-instruction sequence)
 * Assume input takes form 0xAABBCCDD
 *
 * Finally, original 'folding' approach is to split the long into 4 unsigned shorts
 *	add 4 ushorts, resulting in ushort/carry
 *	add carry bits + ushort --> ushort
 *	add carry bits + ushort --> ushort (in case the carry results in an overflow)
 * Truncate to a ushort.  (took 13 instructions)
 * From doing some testing, using the approach in checksum.c:from64to16()
 * results in the same outcome:
 *	split into 2 uints, add those, generating a ulong
 *	add the 3 low ushorts together, generating a uint
 *	a final add of the 2 lower ushorts
 *	truncating the result.
 *
 * Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
 * The cost is 16 instructions (~8 cycles), including two extra loads which
 * may cause additional delay in rare cases (load-load replay traps).
54*4882a593Smuzhiyun */ 55*4882a593Smuzhiyun 56*4882a593Smuzhiyun#include <asm/export.h> 57*4882a593Smuzhiyun .globl csum_ipv6_magic 58*4882a593Smuzhiyun .align 4 59*4882a593Smuzhiyun .ent csum_ipv6_magic 60*4882a593Smuzhiyun .frame $30,0,$26,0 61*4882a593Smuzhiyuncsum_ipv6_magic: 62*4882a593Smuzhiyun .prologue 0 63*4882a593Smuzhiyun 64*4882a593Smuzhiyun ldq_u $0,0($16) # L : Latency: 3 65*4882a593Smuzhiyun inslh $18,7,$4 # U : 0000000000AABBCC 66*4882a593Smuzhiyun ldq_u $1,8($16) # L : Latency: 3 67*4882a593Smuzhiyun sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00 68*4882a593Smuzhiyun 69*4882a593Smuzhiyun and $16,7,$6 # E : src misalignment 70*4882a593Smuzhiyun ldq_u $5,15($16) # L : Latency: 3 71*4882a593Smuzhiyun zapnot $20,15,$20 # U : zero extend incoming csum 72*4882a593Smuzhiyun ldq_u $2,0($17) # L : U L U L : Latency: 3 73*4882a593Smuzhiyun 74*4882a593Smuzhiyun extql $0,$6,$0 # U : 75*4882a593Smuzhiyun extqh $1,$6,$22 # U : 76*4882a593Smuzhiyun ldq_u $3,8($17) # L : Latency: 3 77*4882a593Smuzhiyun sll $19,24,$19 # U : U U L U : 0x000000aa bb000000 78*4882a593Smuzhiyun 79*4882a593Smuzhiyun cmoveq $6,$31,$22 # E : src aligned? 80*4882a593Smuzhiyun ldq_u $23,15($17) # L : Latency: 3 81*4882a593Smuzhiyun inswl $18,3,$18 # U : 000000CCDD000000 82*4882a593Smuzhiyun addl $19,$7,$19 # E : U L U L : <sign bits>bbaabb00 83*4882a593Smuzhiyun 84*4882a593Smuzhiyun or $0,$22,$0 # E : 1st src word complete 85*4882a593Smuzhiyun extql $1,$6,$1 # U : 86*4882a593Smuzhiyun or $18,$4,$18 # E : 000000CCDDAABBCC 87*4882a593Smuzhiyun extqh $5,$6,$5 # U : L U L U 88*4882a593Smuzhiyun 89*4882a593Smuzhiyun and $17,7,$6 # E : dst misalignment 90*4882a593Smuzhiyun extql $2,$6,$2 # U : 91*4882a593Smuzhiyun or $1,$5,$1 # E : 2nd src word complete 92*4882a593Smuzhiyun extqh $3,$6,$22 # U : L U L U : 93*4882a593Smuzhiyun 94*4882a593Smuzhiyun cmoveq $6,$31,$22 # E : dst aligned? 
95*4882a593Smuzhiyun extql $3,$6,$3 # U : 96*4882a593Smuzhiyun addq $20,$0,$20 # E : begin summing the words 97*4882a593Smuzhiyun extqh $23,$6,$23 # U : L U L U : 98*4882a593Smuzhiyun 99*4882a593Smuzhiyun srl $18,16,$4 # U : 0000000000CCDDAA 100*4882a593Smuzhiyun or $2,$22,$2 # E : 1st dst word complete 101*4882a593Smuzhiyun zap $19,0x3,$19 # U : <sign bits>bbaa0000 102*4882a593Smuzhiyun or $3,$23,$3 # E : U L U L : 2nd dst word complete 103*4882a593Smuzhiyun 104*4882a593Smuzhiyun cmpult $20,$0,$0 # E : 105*4882a593Smuzhiyun addq $20,$1,$20 # E : 106*4882a593Smuzhiyun zapnot $18,0xa,$18 # U : 00000000DD00BB00 107*4882a593Smuzhiyun zap $4,0xa,$4 # U : U U L L : 0000000000CC00AA 108*4882a593Smuzhiyun 109*4882a593Smuzhiyun or $18,$4,$18 # E : 00000000DDCCBBAA 110*4882a593Smuzhiyun nop # E : 111*4882a593Smuzhiyun cmpult $20,$1,$1 # E : 112*4882a593Smuzhiyun addq $20,$2,$20 # E : U L U L 113*4882a593Smuzhiyun 114*4882a593Smuzhiyun cmpult $20,$2,$2 # E : 115*4882a593Smuzhiyun addq $20,$3,$20 # E : 116*4882a593Smuzhiyun cmpult $20,$3,$3 # E : (1 cycle stall on $20) 117*4882a593Smuzhiyun addq $20,$18,$20 # E : U L U L (1 cycle stall on $20) 118*4882a593Smuzhiyun 119*4882a593Smuzhiyun cmpult $20,$18,$18 # E : 120*4882a593Smuzhiyun addq $20,$19,$20 # E : (1 cycle stall on $20) 121*4882a593Smuzhiyun addq $0,$1,$0 # E : merge the carries back into the csum 122*4882a593Smuzhiyun addq $2,$3,$2 # E : 123*4882a593Smuzhiyun 124*4882a593Smuzhiyun cmpult $20,$19,$19 # E : 125*4882a593Smuzhiyun addq $18,$19,$18 # E : (1 cycle stall on $19) 126*4882a593Smuzhiyun addq $0,$2,$0 # E : 127*4882a593Smuzhiyun addq $20,$18,$20 # E : U L U L : 128*4882a593Smuzhiyun /* (1 cycle stall on $18, 2 cycles on $20) */ 129*4882a593Smuzhiyun 130*4882a593Smuzhiyun addq $0,$20,$0 # E : 131*4882a593Smuzhiyun zapnot $0,15,$1 # U : Start folding output (1 cycle stall on $0) 132*4882a593Smuzhiyun nop # E : 133*4882a593Smuzhiyun srl $0,32,$0 # U : U L U L : (1 cycle stall on $0) 134*4882a593Smuzhiyun 
135*4882a593Smuzhiyun addq $1,$0,$1 # E : Finished generating ulong 136*4882a593Smuzhiyun extwl $1,2,$2 # U : ushort[1] (1 cycle stall on $1) 137*4882a593Smuzhiyun zapnot $1,3,$0 # U : ushort[0] (1 cycle stall on $1) 138*4882a593Smuzhiyun extwl $1,4,$1 # U : ushort[2] (1 cycle stall on $1) 139*4882a593Smuzhiyun 140*4882a593Smuzhiyun addq $0,$2,$0 # E 141*4882a593Smuzhiyun addq $0,$1,$3 # E : Finished generating uint 142*4882a593Smuzhiyun /* (1 cycle stall on $0) */ 143*4882a593Smuzhiyun extwl $3,2,$1 # U : ushort[1] (1 cycle stall on $3) 144*4882a593Smuzhiyun nop # E : L U L U 145*4882a593Smuzhiyun 146*4882a593Smuzhiyun addq $1,$3,$0 # E : Final carry 147*4882a593Smuzhiyun not $0,$4 # E : complement (1 cycle stall on $0) 148*4882a593Smuzhiyun zapnot $4,3,$0 # U : clear upper garbage bits 149*4882a593Smuzhiyun /* (1 cycle stall on $4) */ 150*4882a593Smuzhiyun ret # L0 : L U L U 151*4882a593Smuzhiyun 152*4882a593Smuzhiyun .end csum_ipv6_magic 153*4882a593Smuzhiyun EXPORT_SYMBOL(csum_ipv6_magic) 154