/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This routine clears to zero a linear memory buffer in user space.
 *
 * Inputs:
 *	in0:	address of buffer
 *	in1:	length of buffer in bytes
 * Outputs:
 *	r8:	number of bytes that didn't get cleared due to a fault
 *
 * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 */

#include <asm/asmmacro.h>
#include <asm/export.h>

//
// arguments
//
#define buf		r32
#define len		r33

//
// local registers
//
#define cnt		r16
#define buf2		r17
#define saved_lc	r18
#define saved_pfs	r19
#define tmp		r20
#define len2		r21
#define len3		r22

//
// Theory of operations:
//	- we check whether or not the buffer is small, i.e., less than 17
//	  in which case we do the byte by byte loop.
//
//	- Otherwise we go progressively from 1 byte store to 8byte store in
//	  the head part, the body is a 16byte store loop and we finish with
//	  the tail for the last 15 bytes.
//	  The good point about this breakdown is that the long buffer handling
//	  contains only 2 branches.
//
//	The reason for not using shifting & masking for both the head and the
//	tail is to stay semantically correct. This routine is not supposed
//	to write bytes outside of the buffer. While most of the time this would
//	be ok, we can't tolerate a mistake. A classical example is the case
//	of multithreaded code where the extra bytes touched are actually owned
//	by another thread which runs concurrently to ours. Another, less likely,
//	example is with device drivers where reading an I/O mapped location may
//	have side effects (same thing for writing).
//

GLOBAL_ENTRY(__do_clear_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,2,0,0,0
	cmp.eq p6,p0=r0,len		// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc		// preserve ar.lc (slow)
	.body
	;;				// avoid WAW on CFM
	adds tmp=-1,len			// br.ctop is repeat/until
	mov ret0=len			// return value is length at this point
(p6)	br.ret.spnt.many rp
	;;
	cmp.lt p6,p0=16,len		// if len > 16 then long memset
	mov ar.lc=tmp			// initialize lc for small count
(p6)	br.cond.dptk .long_do_clear
	;;				// WAR on ar.lc
	//
	// worst case 16 iterations, avg 8 iterations
	//
	// We could have played with the predicates to use the extra
	// M slot for 2 stores/iteration but the cost of initializing
	// the various counters compared to how long the loop is supposed
	// to last on average does not make this solution viable.
	//
1:
	EX( .Lexit1, st1 [buf]=r0,1 )
	adds len=-1,len			// countdown length using len
	br.cloop.dptk 1b
	;;				// avoid RAW on ar.lc
	//
	// .Lexit1: comes from the byte by byte loop
	//	    len contains bytes left
.Lexit1:
	mov ret0=len			// faster than using ar.lc
	mov ar.lc=saved_lc
	br.ret.sptk.many rp		// end of short clear_user


	//
	// At this point we know we have more than 16 bytes to copy
	// so we focus on alignment (no branches required)
	//
	// The use of len/len2 for countdown of the number of bytes left
	// instead of ret0 is due to the fact that the exception code
	// changes the value of r8.
	//
.long_do_clear:
	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
	;;
	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
(p6)	adds len=-1,len;;		// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4			// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
	;;
	adds buf2=8,buf			// setup second base pointer
	mov ar.lc=tmp
	;;

	//
	// 16bytes/iteration core loop
	//
	// The second store can never generate a fault because
	// we come into the loop only when we are 16-byte aligned.
	// This means that if we cross a page then it will always be
	// in the first store and never in the second.
	//
	//
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// doing : left= 16*ar.lc + 16.  this would avoid the addition at
	// every iteration.
	// However we need to keep the synchronization point. A template
	// M;;MB does not exist and thus we can keep the addition at no
	// extra cycle cost (use a nop slot anyway). It also simplifies the
	// (unlikely) error recovery code.
	//

2:	EX(.Lexit3, st8 [buf]=r0,16 )
	;;				// needed to get len correct when error
	st8 [buf2]=r0,16
	adds len=-16,len
	br.cloop.dptk 2b
	;;
	mov ar.lc=saved_lc
	//
	// tail correction based on len only
	//
	// We alternate the use of len3,len2 to allow parallelism and correct
	// error handling. We also reuse p6/p7 to return the correct value.
	// The addition of len2/len3 does not cost anything more compared to
	// the regular memset as we had empty slots.
	//
.dotail:
	mov len2=len			// for parallelization of error handling
	mov len3=len
	tbit.nz p6,p0=len,3
	;;
	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
(p6)	adds len3=-8,len2
	tbit.nz p7,p6=len,2
	;;
	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
(p7)	adds len2=-4,len3
	tbit.nz p6,p7=len,1
	;;
	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
(p6)	adds len3=-2,len2
	tbit.nz p7,p6=len,0
	;;
	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
	mov ret0=r0				// success
	br.ret.sptk.many rp			// end of most likely path

	//
	// Outlined error handling code
	//

	//
	// .Lexit3: comes from the core loop, need to restore pr/lc
	//	    len contains bytes left
	//
	//
	// .Lexit2:
	//	if p6 -> coming from st8 or st2 : len2 contains what's left
	//	if p7 -> coming from st4 or st1 : len3 contains what's left
	// We must restore lc/pr even though they might not have been used.
.Lexit2:
	.pred.rel "mutex", p6, p7
(p6)	mov len=len2
(p7)	mov len=len3
	;;
	//
	// .Lexit3: also reached from the head; len contains bytes left
	//
.Lexit3:
	mov ret0=len
	mov ar.lc=saved_lc
	br.ret.sptk.many rp
END(__do_clear_user)
EXPORT_SYMBOL(__do_clear_user)