/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * memcpy_power7 - POWER7-optimised memcpy.
 *
 * Calling convention (standard kernel memcpy):
 *   r3 = destination, r4 = source, r5 = length in bytes.
 *   Returns the original destination pointer in r3.
 *
 * Copies of fewer than 16 bytes go straight to the scalar tail
 * (.Lshort_copy); copies larger than 4096 bytes take the VMX path
 * when AltiVec is available.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

_GLOBAL(memcpy_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,4096
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save dest for return */
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_ops
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */