/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian-dependent helpers for the unaligned (permute) copy path.
 * On little-endian, lvsr + swapped vperm operands produce the same
 * byte-shifting effect that lvsl + vperm gives on big-endian.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * The errN macros tag the immediately following access with an
 * exception-table entry so that a user-space fault branches to the
 * matching .Ldo_errN recovery path:
 *   err1 - no stack frame, no non-volatiles in use
 *   err2 - stack frame allocated, r14-r22 saved
 *   err3 - inside VMX copy, stack frame allocated
 *   err4 - inside VMX copy, r14-r16 also saved
 */
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


/* Fault recovery: restore saved non-volatiles, leave VMX, then fall
 * through to the common exit which retries via __copy_tofrom_user_base. */
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	/* Reload the original dest/src/len (saved at entry) and hand the
	 * whole copy to the byte-exact fallback, which computes the number
	 * of bytes not copied. */
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


/*
 * unsigned long __copy_tofrom_user_power7(void *to, const void *from,
 *					   unsigned long n)
 * In:  r3 = dest (user or kernel), r4 = src, r5 = byte count
 * Out: r3 = 0 on success; on a fault the copy is redone by
 *      __copy_tofrom_user_base, which returns bytes-not-copied.
 * Copies >3328 bytes use VMX (if available); smaller copies use GPRs.
 */
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	/* Stash original args in our frame-to-be so a fault can restart. */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0	/* cr1 = "VMX unavailable, fall back" */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */