/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>
#include <asm/export.h>
#include <asm/asm-compat.h>

/*
 * Load state from memory into VMX registers including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(load_vr_state)
	li	r4,VRSTATE_VSCR
	lvx	v0,r4,r3
	mtvscr	v0
	REST_32VRS(0,r4,r3)
	blr
EXPORT_SYMBOL(load_vr_state)
_ASM_NOKPROBE_SYMBOL(load_vr_state); /* used by restore_math */

/*
 * Store VMX state into memory, including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(store_vr_state)
	SAVE_32VRS(0, r4, r3)
	mfvscr	v0
	li	r4, VRSTATE_VSCR
	stvx	v0, r4, r3
	blr
EXPORT_SYMBOL(store_vr_state)
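
/*
 * C-side sketch of the two entry points above (the authoritative
 * declarations and struct layout live in the kernel headers; the names
 * below are assumptions, shown only for illustration):
 *
 *	void load_vr_state(struct thread_vr_state *v);
 *	void store_vr_state(struct thread_vr_state *v);
 *
 * Both take a pointer to a save area holding v0-v31 contiguously from
 * offset 0, with VSCR at offset VRSTATE_VSCR.  Note the ordering: the
 * store path saves all 32 VRs first so that v0 is free to stage VSCR,
 * while the load path reads VSCR through v0 before REST_32VRS
 * overwrites it with its real value.
 */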

/*
 * Disable VMX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Enables the VMX for use in the kernel on return.
 * On SMP we know the VMX is free, since we give it up every
 * switch (i.e., no lazy save of the vector registers).
 *
 * Note that on 32-bit this can only use registers that will be
 * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
 */
_GLOBAL(load_up_altivec)
	mfmsr	r5			/* grab the current MSR */
	oris	r5,r5,MSR_VEC@h
	MTMSRD(r5)			/* enable use of AltiVec now */
	isync

	/*
	 * While userspace in general ignores VRSAVE, glibc uses it as a
	 * boolean to optimise userspace context save/restore.  Whenever we
	 * take an altivec unavailable exception we must set VRSAVE to
	 * something non-zero.  Set it to all 1s.  See also the programming
	 * note in the ISA.
	 */
	mfspr	r4,SPRN_VRSAVE
	cmpwi	0,r4,0
	bne+	1f
	li	r4,-1
	mtspr	SPRN_VRSAVE,r4
1:
	/* enable use of VMX after return */
#ifdef CONFIG_PPC32
	mfspr	r5,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
	oris	r9,r9,MSR_VEC@h
#ifdef CONFIG_VMAP_STACK
	tovirt(r5, r5)
#endif
#else
	ld	r4,PACACURRENT(r13)
	addi	r5,r4,THREAD		/* Get THREAD */
	oris	r12,r12,MSR_VEC@h
	std	r12,_MSR(r1)
#endif
	li	r4,1
	stb	r4,THREAD_LOAD_VEC(r5)
	addi	r6,r5,THREAD_VRSTATE
	li	r4,1
	li	r10,VRSTATE_VSCR
	stw	r4,THREAD_USED_VR(r5)
	lvx	v0,r10,r6
	mtvscr	v0
	REST_32VRS(0,r4,r6)
	/* restore registers and return */
	blr
_ASM_NOKPROBE_SYMBOL(load_up_altivec)

/*
 * save_altivec(tsk)
 * Save the task's vector registers to its thread_struct.
 */
_GLOBAL(save_altivec)
	addi	r3,r3,THREAD		/* want THREAD of task */
	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
	PPC_LL	r5,PT_REGS(r3)
	PPC_LCMPI	0,r7,0
	bne	2f
	addi	r7,r3,THREAD_VRSTATE
2:	SAVE_32VRS(0,r4,r7)
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r7
	blr
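
/*
 * A C-side sketch of save_altivec() (assumed prototype, for illustration
 * only; the real declaration lives in the kernel headers):
 *
 *	void save_altivec(struct task_struct *tsk);
 *
 * The state is written to tsk->thread.vr_save_area when that pointer is
 * non-NULL, otherwise to tsk->thread.vr_state.
 */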

#ifdef CONFIG_VSX

#ifdef CONFIG_PPC32
#error This asm code isn't ready for 32-bit kernels
#endif

/*
 * load_up_vsx(unused, unused, tsk)
 * Disable VSX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Reuse the fp and vsx saves, but first check to see if they have
 * been saved already.
 */
_GLOBAL(load_up_vsx)
/* Load FP and VSX registers if they haven't been done yet */
	andi.	r5,r12,MSR_FP
	beql+	load_up_fpu		/* skip if already loaded */
	andis.	r5,r12,MSR_VEC@h
	beql+	load_up_altivec		/* skip if already loaded */

	ld	r4,PACACURRENT(r13)
	addi	r4,r4,THREAD		/* Get THREAD */
	li	r6,1
	stw	r6,THREAD_USED_VSR(r4)	/* ... also set thread used vsr */
	/* enable use of VSX after return */
	oris	r12,r12,MSR_VSX@h
	std	r12,_MSR(r1)
	b	fast_interrupt_return

#endif /* CONFIG_VSX */


/*
 * The routines below are in assembler so we can closely control the
 * usage of floating-point registers.  These routines must be called
 * with preempt disabled.
 */
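
/*
 * These helpers back the AltiVec instruction emulation path (vecemu.c):
 * each operates on 16-byte vectors of four single-precision floats,
 * destination pointer first.  A rough sketch of the assumed C-side
 * declarations (the exact signatures are whatever the caller declares;
 * this is only illustrative):
 *
 *	void vaddfp(void *dst, void *a, void *b);
 *	void vrefp(void *dst, void *src);
 */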
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)
#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)
#endif

	.text
/*
 * Internal routine to enable floating point and set FPSCR to 0.
 * Don't call it from C; it doesn't use the normal calling convention.
 * Callers load their return address into r12 before the bl, may use
 * scratch slots from 32(r1) upwards in the 64-byte frame set up here,
 * and exit by branching to fpdisable, which restores FPSCR, fr0/fr1/fr31,
 * the MSR and the stack frame.
 */
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	mfmsr	r10
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync
	stfd	fr0,24(r1)
	stfd	fr1,16(r1)
	stfd	fr31,8(r1)
	LDCONST(fr1, fpzero)
	mffs	fr31
	MTFSF_L(fr1)
	blr

fpdisable:
	mtlr	r12
	MTFSF_L(fr31)
	lfd	fr31,8(r1)
	lfd	fr1,16(r1)
	lfd	fr0,24(r1)
	mtmsr	r10
	isync
	addi	r1,r1,64
	blr

/*
 * Vector add, floating point.
 */
_GLOBAL(vaddfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector subtract, floating point.
 */
_GLOBAL(vsubfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector multiply and add, floating point.
 */
_GLOBAL(vmaddfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fmadds	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector negative multiply and subtract, floating point.
 */
_GLOBAL(vnmsubfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fnmsubs	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector reciprocal estimate.  We just compute 1.0/x.
 * r3 -> destination, r4 -> source.
 */
_GLOBAL(vrefp)
	mflr	r12
	bl	fpenable
	li	r0,4
	LDCONST(fr1, fpone)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
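
/*
 * Note: the architected vrefp/vrsqrtefp instructions are only required to
 * return estimates (roughly 12 bits of relative accuracy), so the full
 * divide above and the Newton-Raphson-refined result below comfortably
 * satisfy the architectural requirement.
 */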

/*
 * Vector reciprocal square-root estimate, floating point.
 * We use the frsqrte instruction for the initial estimate followed
 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
 * r3 -> destination, r4 -> source.
 */
_GLOBAL(vrsqrtefp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	li	r0,4
	LDCONST(fr4, fpone)
	LDCONST(fr5, fphalf)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	frsqrte	fr1,fr0		/* r = frsqrte(s) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr5,56(r1)
	lfd	fr4,48(r1)
	lfd	fr3,40(r1)
	lfd	fr2,32(r1)
	b	fpdisable
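
/*
 * The update used above is the Newton-Raphson step for f(r) = 1/r^2 - s:
 *
 *	r' = r - f(r)/f'(r) = r + (r/2) * (1 - s*r*r)
 *
 * Starting from the hardware frsqrte estimate, each iteration roughly
 * doubles the number of correct bits, which is why two iterations are
 * enough for a single-precision result.
 */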