/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *   | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *   ^                                 ^                                 ^
 * 0xbbbb10                        0xbbbb20                          0xbbbb30
 *                             ^
 *                           _vaddr
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
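
/*
 * Illustrative only, not part of the build: a rough C model of what
 * LD_VSR_CROSS16B produces. The helper name and types are hypothetical;
 * it assumes the two aligned quadwords around p are readable, which the
 * uses below arrange.
 *
 *	static inline void ld_vsr_cross16b_model(const unsigned char *p,
 *						 unsigned char out[16])
 *	{
 *		const unsigned char *aligned =
 *			(const unsigned char *)((unsigned long)p & ~15UL);
 *		unsigned long off = (unsigned long)p & 15;
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			out[i] = aligned[off + i];	// spans both aligned QWs
 *	}
 */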

/*
 * There are 2 categories for memcmp:
 * 1) src/dst has the same offset from the 8-byte boundary. The handlers
 * are named like .Lsameoffset_xxxx
 * 2) src/dst has a different offset from the 8-byte boundary. The handlers
 * are named like .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have
	 * the same offset from the 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop if comparing aligned addrs
	 * with fewer than 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start

.Lsameoffset_8bytes_make_align_start:
	/* attempt to compare the bytes ahead of the 8-byte boundary so
	 * that the rest of the comparison can run 8-byte aligned.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero
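
/*
 * Illustrative only: a C sketch of the leading-edge handling above, with
 * a hypothetical load_be64() standing in for the LD macro (LD is ldbrx
 * on little endian, so the register always holds the bytes in big-endian
 * order). rlwinm r6,r3,3,26,28 computes (r3 & 7) * 8, the number of bits
 * taken up by the bytes that precede the buffers in the aligned
 * doubleword; shifting left by that amount discards them.
 *
 *	unsigned long head = (unsigned long)s1 & 7;	// same as s2 here
 *	unsigned long a = load_be64(s1 - head) << (head * 8);
 *	unsigned long b = load_be64(s2 - head) << (head * 8);
 *
 *	if (a != b)
 *		return a > b ? 1 : -1;
 *	n -= 8 - head;	// 8 - head bytes consumed; now 8-byte aligned
 */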

.Lsameoffset_8bytes_aligned:
	/* now we are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
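
/*
 * Illustrative only: the tail handling above in C, again with a
 * hypothetical load_be64() for the LD macro. s1 is 8-byte aligned, so
 * only s2 can cross into the next (at least 4K) page; if it could, the
 * byte loop is used instead. Otherwise both tails are loaded whole and
 * shifted right so only the n remaining bytes (1..7) are compared.
 *
 *	if (((unsigned long)s2 & 0xfff) > 0xff8)	// 0xff8 = 4096 - 8
 *		goto byte_loop;
 *
 *	shift = (8 - n) * 8;
 *	a = load_be64(s1) >> shift;
 *	b = load_be64(s2) >> shift;
 *	if (a != b)
 *		return a > b ? 1 : -1;
 *	return 0;
 */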

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use vmx loop if length is equal or greater than 4K */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr
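
/*
 * Note on the return value (informational): memcmp() only requires the
 * sign of the result to match the first differing byte. The byte loop
 * returns the actual byte difference while the doubleword and VMX paths
 * return +1/-1 from an unsigned compare; both are valid because LD is
 * byte-reversing on little endian, so the earliest byte in memory always
 * sits in the most significant bits and an unsigned integer compare
 * matches lexicographic byte order. For example, "\x01\xff" vs "\x02\x00"
 * loads as 0x01ff vs 0x0200, and 0x01ff < 0x0200 just as 0x01 < 0x02.
 */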

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addrs that have the same offset from the
	 * 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before applying VMX instructions, which incur a 32x128-bit VMX
	 * register save/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% of cases that fail there.
	 */
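
/*
 * Illustrative only: the pre-check below in C, with load_be64() again a
 * hypothetical stand-in for LD. Four 8-byte GPR compares are done before
 * paying the enter_vmx_ops()/exit_vmx_ops() cost; only if the first 32
 * bytes match is VMX entered.
 *
 *	for (i = 0; i < 4; i++) {
 *		a = load_be64(s1);
 *		b = load_be64(s2);
 *		if (a != b)
 *			return a > b ? 1 : -1;
 *		s1 += 8;
 *		s2 += 8;
 *		n -= 8;
 *	}
 *	// all 32 bytes equal: worth enabling VMX and continuing below
 */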

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* need to check whether r4 has the same offset as r3
	 * from the 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Need to further align to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes in each loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4		/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is equal to or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before enabling VMX operations */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5		/* loop handles 32 bytes each time */
	clrldi	r5,r5,59
	mtctr	r6
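
/*
 * In the loop below s2 stays 16-byte misaligned, so each 16 bytes of s2
 * are rebuilt from two aligned quadwords via LD_VSR_CROSS16B. The second
 * aligned quadword of one step is the first aligned quadword of the
 * next, so it is carried forward in v6 (the "vor v6,v8,v8" copy) rather
 * than reloaded. Roughly, per 16-byte step (names illustrative):
 *
 *	cur    = v6;				// aligned QW kept from last step
 *	next   = aligned_load_16(s2 + 16);	// the lvx inside the macro
 *	window = permute(cur, next, mask);	// unaligned 16 bytes of s2
 *	compare(window, load_16(s1));		// s1 is 16-byte aligned
 *	v6     = next;
 *	s1    += 16; s2 += 16;
 */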

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* either way, the diff will appear within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)
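
/*
 * Illustrative only: the C semantics this routine implements. Any result
 * with the same sign as the reference below is acceptable, which is why
 * the paths above may return either a raw byte difference or +/-1.
 *
 *	int memcmp_ref(const void *p, const void *q, unsigned long n)
 *	{
 *		const unsigned char *a = p, *b = q;
 *
 *		for (; n; n--, a++, b++) {
 *			if (*a != *b)
 *				return *a - *b;
 *		}
 *		return 0;
 *	}
 */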