/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memcpy.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
 *	- memory accessed as aligned quadwords only
 *	- uses cmpbge to compare 8 bytes in parallel
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * Temp usage notes:
 *	$1, $2		- scratch
 */
#include <asm/export.h>
	.set noreorder
	.set noat

	.align 4
	.globl memcpy
	.ent memcpy
memcpy:
	.frame $30,0,$26,0
	.prologue 0

	mov	$16, $0			# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1		# E : are source and dest alignments the same?
	and	$1, 7, $1		# E : are they the same mod 8?

	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1		# E : Are both 0mod8?
	beq	$1, $both_0mod8		# U : Yes
	nop				# E :

	/*
	 * source and dest are same misalignment.  move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * At least one byte more to move
	 */

$head_align:
	ldbu	$1, 0($17)		# L : grab a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	stb	$1, 0($16)		# L :
	addq	$16, 1, $16		# E : dest++
	and	$16, 7, $1		# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align		# U :

$both_0mod8:
	cmple	$18, 127, $1		# E : Can we unroll the loop?
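	/*
	 * Note: the unrolled loop below moves 64 bytes per trip and issues
	 * its wh64 write hint one 64-byte block ahead of the stores, so it
	 * is only worth entering when at least 128 bytes (two trips)
	 * remain; with less than that, branch to the simple
	 * quad-at-a-time loop at $no_unroll.
	 */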
	bne	$1, $no_unroll		# U :
	and	$16, 63, $1		# E : get mod64 alignment
	beq	$1, $do_unroll		# U : no single quads to fiddle

$single_head_quad:
	ldq	$1, 0($17)		# L : get 8 bytes
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store
	addq	$16, 8, $16		# E : dest += 8
	and	$16, 63, $1		# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

$do_unroll:
	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads		# U : Nope
	nop				# E :

$unroll_body:
	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
					#      ($7) are about to be over-written
	ldq	$6, 0($17)		# L0 : bytes 0..7
	nop				# E :
	nop				# E :

	ldq	$4, 8($17)		# L : bytes 8..15
	ldq	$5, 16($17)		# L : bytes 16..23
	addq	$7, 64, $7		# E : Update next wh64 address
	nop				# E :

	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 64, $1		# E : fallback value for wh64
	nop				# E :
	nop				# E :

	addq	$17, 32, $17		# E : src += 32 bytes
	stq	$6, 0($16)		# L : bytes 0..7
	nop				# E :
	nop				# E :

	stq	$4, 8($16)		# L : bytes 8..15
	stq	$5, 16($16)		# L : bytes 16..23
	subq	$18, 192, $2		# E : At least two more trips to go?
	nop				# E :

	stq	$3, 24($16)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32 bytes
	nop				# E :
	nop				# E :

	ldq	$6, 0($17)		# L : bytes 0..7
	ldq	$4, 8($17)		# L : bytes 8..15
	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
					# fallback wh64 address if < 2 more trips
	nop				# E :

	ldq	$5, 16($17)		# L : bytes 16..23
	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32
	subq	$18, 64, $18		# E : count -= 64

	addq	$17, 32, $17		# E : src += 32
	stq	$6, -32($16)		# L : bytes 0..7
	stq	$4, -24($16)		# L : bytes 8..15
	cmple	$18, 63, $1		# E : At least one more trip?

	stq	$5, -16($16)		# L : bytes 16..23
	stq	$3, -8($16)		# L : bytes 24..31
	nop				# E :
	beq	$1, $unroll_body

$tail_quads:
$no_unroll:
	.align 4
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop				# E :
	nop				# E :

$move_a_quad:
	ldq	$1, 0($17)		# L : fetch 8
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store 8
	addq	$16, 8, $16		# E : dest += 8
	bge	$18, $move_a_quad	# U :
	nop				# E :

$less_than_8:
	.align 4
	addq	$18, 8, $18		# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop				# E :
	nop				# E :

	/* Trailing bytes */
$tail_bytes:
	subq	$18, 1, $18		# E : count--
	ldbu	$1, 0($17)		# L : fetch a byte
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($16)		# L : store a byte
	addq	$16, 1, $16		# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop				# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

$misaligned:
	mov	$0, $4			# E : dest temp
	and	$0, 7, $1		# E : dest alignment mod8
	beq	$1, $dest_0mod8		# U : life doesn't totally suck
	nop

$aligndest:
	ble	$18, $nomoredata	# U :
	ldbu	$1, 0($17)		# L : fetch a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++

	stb	$1, 0($4)		# L : store it
	addq	$4, 1, $4		# E : dest++
	and	$4, 7, $1		# E : dest 0mod8 yet?
	bne	$1, $aligndest		# U : go until we are aligned.

	/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
	subq	$18, 8, $18		# E : At least a quad left?
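	/*
	 * The $mis_quad loop below uses the standard Alpha unaligned-copy
	 * idiom: ldq_u fetches the aligned quadword containing the
	 * addressed byte, extql/extqh shift two neighbouring quadwords by
	 * the byte offset in $17, and bis merges them into a single
	 * quadword for the aligned store through $4.
	 */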
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
	nop				# E :

$mis_quad:
	ldq_u	$16, 8($17)		# L : Fetch next 8
	extql	$3, $17, $3		# U : masking
	extqh	$16, $17, $1		# U : masking
	bis	$3, $1, $1		# E : merged bytes to store

	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	stq	$1, 0($4)		# L : store 8 (aligned)
	mov	$16, $3			# E : "rotate" source data

	addq	$4, 8, $4		# E : dest += 8
	bge	$18, $mis_quad		# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18		# E : account for tail stuff
	ble	$18, $nomoredata	# U :
	nop
	nop

$misalign_byte:
	ldbu	$1, 0($17)		# L : fetch 1
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($4)		# L : store
	addq	$4, 1, $4		# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop


$nomoredata:
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	.end memcpy
	EXPORT_SYMBOL(memcpy)

/* For backwards module compatibility.  */
__memcpy = memcpy
.globl __memcpy
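/*
 * Reference note (illustrative only, not part of the build): a minimal C
 * sketch of the copy strategy implemented above, assuming 64-bit unsigned
 * longs and ignoring strict-aliasing niceties.  The hypothetical
 * memcpy_sketch() below mirrors the dispatch on alignment; the 64-byte
 * unrolled loop with wh64 hints and the ldq_u/extql/extqh quadword merging
 * of the misaligned path have no direct C equivalent and are shown here
 * simply as plain byte/quadword copies.
 *
 *	void *memcpy_sketch(void *dest, const void *src, unsigned long n)
 *	{
 *		unsigned char *d = dest;
 *		const unsigned char *s = src;
 *
 *		// source and dest differ mod 8: the assembly byte-aligns
 *		// dest and merges shifted quadwords; here we just copy bytes
 *		if (((unsigned long)d ^ (unsigned long)s) & 7) {
 *			while (n--)
 *				*d++ = *s++;
 *			return dest;
 *		}
 *
 *		while (n && ((unsigned long)d & 7)) {	// head bytes to 0mod8
 *			*d++ = *s++;
 *			n--;
 *		}
 *		while (n >= 8) {			// whole quadwords
 *			*(unsigned long *)d = *(const unsigned long *)s;
 *			d += 8;
 *			s += 8;
 *			n -= 8;
 *		}
 *		while (n--)				// trailing bytes
 *			*d++ = *s++;
 *		return dest;
 *	}
 */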