/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */

/*
 * Register usage on entry, as used by both variants below:
 *   r0  = destination pointer (never modified, so it is also the return value)
 *   r1  = fill value (only the low byte is stored)
 *   r2  = byte count
 *   r31 = return address
 *
 * Reading note: instructions grouped in { } form one packet.  A plain
 * "if pN" / "if !pN" inside a packet tests the predicate value produced
 * by an EARLIER packet, while "pN.new" tests the value produced in the
 * same packet.  Several packets below rely on this by consuming the old
 * predicate while computing the next one.
 */


/*
 * HEXAGON_OPT_FUNC_BEGIN name
 * Opens a global function symbol \name in .text, aligned to 2^4 bytes.
 */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm

/*
 * HEXAGON_OPT_FUNC_FINISH name
 * Records the size of \name as the distance from its label to here.
 */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm

/* FUNCTION: memset (v2 version) */
/*
 * Strategy:
 *   - count == 0: return immediately.
 *   - count <= 7: store one byte per loop iteration.
 *   - otherwise: store a byte / half / word as needed to reach 8-byte
 *     alignment, store doublewords in a hardware loop, then finish with
 *     a word / half / byte tail.
 *
 * Working registers:
 *   r4 (and r5) = fill byte replicated into every byte lane
 *   r8          = running store pointer
 *   r9          = 8 - (dest & 7); its bits 0..2 select the byte, half and
 *                 word alignment stores (all clear when dest is aligned)
 *   r3:2        = remaining count as a 64-bit pair (r3 is zeroed)
 *   r7:6        = size of the current store as a 64-bit pair (r7 is zeroed)
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)	/* r7 = dest & 7 */
		p0 = cmp.eq(r2, #0)		/* p0: count == 0 */
		p1 = cmp.gtu(r2, #7)		/* p1: count > 7 */
	}
	{
		r4 = vsplatb(r1)	/* replicate fill byte into all 4 bytes */
		r8 = r0			/* leave r0 intact for return val */
		r9 = sub(r6, r7)	/* bytes until double alignment */
		if p0 jumpr r31		/* count == 0, so return */
	}
	{
		r3 = #0			/* upper word of the r3:2 count pair */
		r7 = #0			/* upper word of the r7:6 size pair */
		p0 = tstbit(r9, #0)	/* p0: a single byte store is needed */
		if p1 jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	{
		r6 = #1
		p0 = tstbit(r9, #1)	/* next decision: half store needed? */
		p1 = cmp.eq(r2, #1)	/* p1: this store would be the last */
		/* tests the p0 from the previous packet (bit 0 of r9) */
		if !p0 jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)	/* count -= 1 */
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)	/* next decision: word store needed? */
		p1 = cmp.eq(r2, #2)
		/* tests the p0 computed before this packet (bit 1 of r9) */
		if !p0 jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)	/* count -= 2 */
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)	/* at least one doubleword left? */
		p1 = cmp.eq(r2, #4)
		/* tests the p0 computed before this packet (bit 2 of r9) */
		if !p0 jump 5f /* skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)	/* count -= 4 */
		/*
		 * r2 is still the pre-subtract count here, so "> 11" means
		 * more than 7 bytes remain after this word store.
		 */
		p0 = cmp.gtu(r2, #11)
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	{
		r10 = lsr(r2, #3)	/* number of whole doublewords */
		/*
		 * r3 was zeroed above, so this compare clears p1; label 7
		 * tests p1 when the double loop is skipped.
		 */
		p1 = cmp.eq(r3, #1)
		if !p0 jump 7f /* skip double loop */
	}
	{
		r5 = r4			/* r5:4 = 8 copies of the fill byte */
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)	/* count -= 8 */
		/* pre-subtract count == 8: nothing remains after this store */
		p1 = cmp.eq(r2, #8)
	}:endloop0
	.falign
7: /* skip double loop */
	{
		p0 = tstbit(r2, #2)	/* final word store needed? */
		if p1 jumpr r31		/* double loop consumed everything */
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)	/* final half store needed? */
		p1 = cmp.eq(r2, #4)
		/* tests the p0 from the previous packet (bit 2 of count) */
		if !p0 jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)	/* count -= 4 */
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	{
		p1 = cmp.eq(r2, #2)
		/* tests the p0 computed before this packet (bit 1 of count) */
		if !p0 jump 9f /* skip final half store */
	}
	{
		memh(r8++#2) = r4
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif


/* FUNCTION: memset (v3 and higher version) */
/*
 * Strategy:
 *   - count == 0: return immediately.
 *   - count <= 8: store one byte per loop iteration.
 *   - otherwise: store a byte / half / word as needed to reach 8-byte
 *     alignment.  If more than 127 bytes remain, store up to three
 *     doublewords to reach 32-byte alignment and then fill 32 bytes per
 *     iteration using dczeroa.  Whatever is left is filled a doubleword
 *     at a time, followed by a word / half / byte tail.
 *
 * Working registers:
 *   r7   = fill byte replicated into every byte lane
 *   r5:4 = the same pattern as a 64-bit pair
 *   r6   = running store pointer
 *   r2   = remaining byte count
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)
		r6 = r0
		if (r2==#0) jump:nt .L1		/* nothing to do */
	}
	{
		r5:4=combine(r7,r7)
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3		/* more than 8 bytes */
	}
	/* 1..8 bytes: simple byte loop */
	{
		r3 = r0
		loop0(.L47,r2)
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
.L3:
	/* odd address: store one byte to reach 2-byte alignment */
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:
	/* store one halfword if needed to reach 4-byte alignment */
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)	/* compares the pre-decrement count */
		if (p0.new) jump:nt .L1
	}
.L10:
	/* store one word if needed to reach 8-byte alignment */
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)	/* compares the pre-decrement count */
		if (p0.new) jump:nt .L1
	}
.L12:
	/* the 32-byte path is only taken when more than 127 bytes remain */
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14
	}
	/* up to three doubleword stores to reach 32-byte alignment */
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:
	{
		r3 = lsr(r2,#5)		/* number of whole 32-byte lines */
		/*
		 * This tests all of r1, not just its low byte.  A value whose
		 * low byte is zero but whose upper bits are set takes the
		 * .L18 store loop, which still writes the splatted byte.
		 */
		if (r1!=#0) jump:nt .L18
	}
	/* zero fill: dczeroa alone handles each 32-byte line */
	{
		/*
		 * loop0 reads the pre-packet r3 (the line count).  r8 is
		 * recomputed at .L14 and r3 is not read again below.
		 */
		r8 = r3
		r3 = r6
		loop0(.L46,r3)
	}
	.falign
.L46:
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
.L14:
	/* fill the remaining whole doublewords */
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
.L28:
	/* tail: at most one word, one halfword and one byte */
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31
.L18:
	/*
	 * Non-zero fill, 32 bytes per iteration: dczeroa on the line, then
	 * four doubleword stores overwrite it with the pattern.
	 * NOTE(review): the dczeroa is presumably there to allocate the
	 * cache line without reading it from memory first - confirm against
	 * the architecture manual.
	 */
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45  */
		jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif