/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __MIPS_ASM_SYNC_H__
#define __MIPS_ASM_SYNC_H__

/*
 * sync types are defined by the MIPS64 Instruction Set documentation in Volume
 * II-A of the MIPS Architecture Reference Manual, which can be found here:
 *
 *   https://www.mips.com/?do-download=the-mips64-instruction-set-v6-06
 *
 * Two types of barrier are provided:
 *
 *   1) Completion barriers, which ensure that a memory operation has actually
 *      completed & often involve stalling the CPU pipeline to do so.
 *
 *   2) Ordering barriers, which only ensure that affected memory operations
 *      won't be reordered in the CPU pipeline in a manner that violates the
 *      restrictions imposed by the barrier.
 *
 * Ordering barriers can be more efficient than completion barriers, since:
 *
 *   a) Ordering barriers only require memory access instructions which precede
 *      them in program order (older instructions) to reach a point in the
 *      load/store datapath beyond which reordering is not possible before
 *      allowing memory access instructions which follow them (younger
 *      instructions) to be performed. That is, older instructions don't
 *      actually need to complete - they just need to get far enough that all
 *      other coherent CPUs will observe their completion before they observe
 *      the effects of younger instructions.
 *
 *   b) Multiple variants of ordering barrier are provided which allow the
 *      effects to be restricted to different combinations of older or younger
 *      loads or stores. By way of example, if we only care that stores older
 *      than a barrier are observed prior to stores that are younger than a
 *      barrier & don't care about the ordering of loads then the 'wmb'
 *      ordering barrier can be used. Limiting the barrier's effects to stores
 *      allows loads to continue unaffected & potentially allows the CPU to
 *      make progress faster than if younger loads had to wait for older stores
 *      to complete. A sketch of this case follows below.
 */
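/*
 * Illustrative sketch, not part of the original header: the 'wmb' case from
 * point (b) above. A producer that writes a payload & then a ready flag only
 * needs its two stores to be observed in that order, so a store-store
 * ordering barrier between them suffices (the registers & offsets below are
 * assumptions for the example; stype 0x4 is SYNC_WMB per the MIPS64 manual
 * referenced above):
 *
 *	sw	$t0, 0($a0)	# store the payload
 *	sync	0x4		# order older stores before younger stores
 *	sw	$t1, 4($a0)	# publish the ready flag
 *
 * A full completion barrier (sync 0) would also be correct here, but may
 * stall the pipeline waiting for the first store to complete.
 */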
/*
 * No sync instruction at all; used to allow code to nullify the effect of the
 * __SYNC() macro without needing lots of #ifdefery.
 */
#define __SYNC_none	-1

/*
 * A full completion barrier; all memory accesses appearing prior to this sync
 * instruction in program order must complete before any memory accesses
 * appearing after this sync instruction in program order.
 */
#define __SYNC_full	0x00

/*
 * For now we use a full completion barrier to implement all sync types, until
 * we're satisfied that lightweight ordering barriers defined by MIPSr6 are
 * sufficient to uphold our desired memory model.
 */
#define __SYNC_aq	__SYNC_full
#define __SYNC_rl	__SYNC_full
#define __SYNC_mb	__SYNC_full

/*
 * ...except on Cavium Octeon CPUs, which have been using the 'wmb' ordering
 * barrier since 2010 & omit 'rmb' barriers because the CPUs don't perform
 * speculative reads.
 */
#ifdef CONFIG_CPU_CAVIUM_OCTEON
# define __SYNC_rmb	__SYNC_none
# define __SYNC_wmb	0x04
#else
# define __SYNC_rmb	__SYNC_full
# define __SYNC_wmb	__SYNC_full
#endif

/*
 * A GINV sync is a little different; it doesn't relate directly to loads or
 * stores, but instead causes synchronization of an icache or TLB global
 * invalidation operation triggered by the ginvi or ginvt instructions
 * respectively. In cases where we need to know that a ginvi or ginvt operation
 * has been performed by all coherent CPUs, we must issue a sync instruction of
 * this type. Once this instruction graduates all coherent CPUs will have
 * observed the invalidation.
 */
#define __SYNC_ginv	0x14

/* Trivial; indicate that we always need this sync instruction. */
#define __SYNC_always	(1 << 0)

/*
 * Indicate that we need this sync instruction only on systems with weakly
 * ordered memory access. In general this is most MIPS systems, but there are
 * exceptions which provide strongly ordered memory.
 */
#ifdef CONFIG_WEAK_ORDERING
# define __SYNC_weak_ordering	(1 << 1)
#else
# define __SYNC_weak_ordering	0
#endif

/*
 * Indicate that we need this sync instruction only on systems where LL/SC
 * don't implicitly provide a memory barrier. In general this is most MIPS
 * systems.
 */
#ifdef CONFIG_WEAK_REORDERING_BEYOND_LLSC
# define __SYNC_weak_llsc	(1 << 2)
#else
# define __SYNC_weak_llsc	0
#endif
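/*
 * Illustrative examples, not part of the original header, of how the sync
 * types & reason flags above combine in the __SYNC() macro defined at the
 * bottom of this file. A sync is only emitted when CONFIG_CPU_HAS_SYNC=y,
 * the type is not __SYNC_none & the reason evaluates to non-zero:
 *
 *	__SYNC(mb, always)	  - emits 'sync 0', since __SYNC_mb is
 *				    currently an alias for __SYNC_full.
 *	__SYNC(mb, weak_ordering) - emits 'sync 0' only when
 *				    CONFIG_WEAK_ORDERING=y; on strongly
 *				    ordered systems __SYNC_weak_ordering is 0
 *				    & nothing is emitted.
 *	__SYNC(rmb, always)	  - emits nothing on Cavium Octeon, where
 *				    __SYNC_rmb is __SYNC_none (-1).
 */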
/*
 * Some Loongson 3 CPUs have a bug wherein execution of a memory access (load,
 * store or prefetch) in between an LL & SC can cause the SC instruction to
 * erroneously succeed, breaking atomicity. Whilst it's unusual to write code
 * containing such sequences, this bug bites harder than we might otherwise
 * expect due to reordering & speculation:
 *
 *   1) A memory access appearing prior to the LL in program order may actually
 *      be executed after the LL - this is the reordering case.
 *
 *      In order to avoid this we need to place a memory barrier (ie. a SYNC
 *      instruction) prior to every LL instruction, in between it and any
 *      earlier memory access instructions.
 *
 *      This reordering case is fixed by 3A R2 CPUs, ie. 3A2000 models and
 *      later.
 *
 *   2) If a conditional branch exists between an LL & SC with a target outside
 *      of the LL-SC loop, for example an exit upon value mismatch in cmpxchg()
 *      or similar, then misprediction of the branch may allow speculative
 *      execution of memory accesses from outside of the LL-SC loop.
 *
 *      In order to avoid this we need a memory barrier (ie. a SYNC instruction)
 *      at each affected branch target.
 *
 *      This case affects all current Loongson 3 CPUs.
 *
 * Both cases described above cause an error in the cache coherence protocol:
 * the Invalidate of a competing LL-SC goes 'missing', so the core erroneously
 * observes that it still has Exclusive state & lets the SC proceed.
 *
 * Therefore the error only occurs on SMP systems.
 */
#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS
# define __SYNC_loongson3_war	(1 << 31)
#else
# define __SYNC_loongson3_war	0
#endif

/*
 * Some Cavium Octeon CPUs suffer from a bug that causes a single wmb ordering
 * barrier to be ineffective, requiring the use of 2 in sequence to provide an
 * effective barrier as noted by commit 6b07d38aaa52 ("MIPS: Octeon: Use
 * optimized memory barrier primitives."). Here we specify that the affected
 * sync instructions should be emitted twice.
 * Note that this expression is evaluated by the assembler (not the compiler),
 * and that the assembler evaluates '==' as 0 or -1, not 0 or 1.
 */
#ifdef CONFIG_CPU_CAVIUM_OCTEON
# define __SYNC_rpt(type)	(1 - (type == __SYNC_wmb))
#else
# define __SYNC_rpt(type)	1
#endif
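/*
 * Illustrative sketch, not part of the original header, of where the two
 * Loongson 3 workaround barriers described above land in a typical LL/SC
 * (cmpxchg-style) loop, using the __SYNC() macro defined below. Register
 * names & labels are assumptions for the example:
 *
 *	__SYNC(full, loongson3_war)		# before the LL (case 1)
 *	1:	ll	$t0, 0($a0)
 *		bne	$t0, $a1, 2f		# exit on value mismatch (case 2)
 *		move	$t1, $a2
 *		sc	$t1, 0($a0)
 *		beqz	$t1, 1b
 *	2:	__SYNC(full, loongson3_war)	# at the branch target (case 2)
 *
 * When CONFIG_CPU_LOONGSON3_WORKAROUNDS is disabled, __SYNC_loongson3_war is
 * 0 and both barriers disappear at assembly time.
 */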
/*
 * The main event. Here we actually emit a sync instruction of a given type, if
 * reason is non-zero.
 *
 * In future we have the option of emitting entries in a fixups-style table
 * here that would allow us to opportunistically remove some sync instructions
 * when we detect at runtime that we're running on a CPU that doesn't need
 * them.
 */
#ifdef CONFIG_CPU_HAS_SYNC
# define ____SYNC(_type, _reason, _else)			\
	.if	(( _type ) != -1) && ( _reason );		\
	.set	push;						\
	.set	MIPS_ISA_LEVEL_RAW;				\
	.rept	__SYNC_rpt(_type);				\
	sync	_type;						\
	.endr;							\
	.set	pop;						\
	.else;							\
	_else;							\
	.endif
#else
# define ____SYNC(_type, _reason, _else)
#endif

/*
 * Preprocessor magic to expand macros used as arguments before we insert them
 * into assembly code.
 */
#ifdef __ASSEMBLY__
# define ___SYNC(type, reason, else)				\
	____SYNC(type, reason, else)
#else
# define ___SYNC(type, reason, else)				\
	__stringify(____SYNC(type, reason, else))
#endif

#define __SYNC(type, reason)					\
	___SYNC(__SYNC_##type, __SYNC_##reason, )
#define __SYNC_ELSE(type, reason, else)				\
	___SYNC(__SYNC_##type, __SYNC_##reason, else)

#endif /* __MIPS_ASM_SYNC_H__ */
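/*
 * Illustrative usage sketches, not part of the original header. From C,
 * __SYNC() expands (via __stringify) to a string that can be dropped into
 * inline assembly, so an unconditional full barrier helper might look like
 * this (the function name is an assumption for the example):
 *
 *	static inline void example_mb(void)
 *	{
 *		asm volatile(__SYNC(mb, always) ::: "memory");
 *	}
 *
 * From assembly (.S) files the same macro expands directly to assembler
 * directives & may be placed on a line of its own:
 *
 *	__SYNC(rmb, always)
 *
 * __SYNC_ELSE() additionally names an instruction to emit when the sync is
 * not required, e.g. a nop to keep the surrounding code a fixed length:
 *
 *	__SYNC_ELSE(full, weak_llsc, nop)
 */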