/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */
#include <asm/export.h>
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

	/*
	 * ___memset(void *dst, int c, size_t len)
	 * In:  $16 = dst, $17 = fill byte (low 8 bits used), $18 = len
	 * Out: $0  = dst (original destination, returned to caller)
	 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis $1,$2,$17		# E : 000000000000chch
	insbl $1,2,$3		# U : 0000000000ch0000
	insbl $1,3,$4		# U : 00000000ch000000

	or $3,$4,$3		# E : 00000000chch0000
	inswl $17,4,$5		# U : 0000chch00000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	inswl $17,6,$2		# U : chch000000000000

	or $17,$3,$17		# E : 00000000chchchch
	or $2,$5,$2		# E : chchchch00000000
	bic $1,7,$1		# E : fit within a single quadword?
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : chchchchchchchch
	beq $1,within_quad_b	# U :
	nop			# E :
	beq $3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 *
	 * __constant_c_memset(void *dst, u64 c, size_t len)
	 * In:  $16 = dst, $17 = fill pattern already replicated to 64 bits,
	 *      $18 = len.  Out: $0 = dst.
	 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 *
	 * __memset16(void *dst, u16 c, size_t len)
	 * In:  $16 = dst, $17 = 16-bit fill value (low 16 bits used),
	 *      $18 = len in bytes.  Out: $0 = dst.
	 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq $18,$16,$6		# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w # U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)