/*
 * M7memset.S: SPARC M7 optimized memset.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * M7memset.S: M7 optimized memset.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memset(void *sp1, int c, size_t n)
 *	{
 *	    if (n != 0) {
 *		char *sp = sp1;
 *		do {
 *		    *sp++ = (char)c;
 *		} while (--n != 0);
 *	    }
 *	    return (sp1);
 *	}
 *
 * The algorithm is as follows :
 *
 *	For small stores of 7 or fewer bytes, the bytes are stored one at
 *	a time.
 *
 *	For stores of fewer than 32 bytes, align the address on a 4-byte
 *	boundary.  Then store as many 4-byte chunks as possible, followed
 *	by the trailing bytes.
 *
 *	For stores of 32 bytes or more, align the address on an 8-byte
 *	boundary.
 *	if (count >= 64) {
 *		store 8-byte chunks to align the address on a 64-byte boundary
 *		if (value to be set is zero && count >= MIN_ZERO) {
 *			Using BIS stores, set the first long word of each
 *			64-byte cache line to zero which will also clear the
 *			other seven long words of the cache line.
 *		}
 *		else if (count >= MIN_LOOP) {
 *			Using BIS stores, set the first long word of each of
 *			ST_CHUNK cache lines (64 bytes each) before the main
 *			loop is entered.
 *			In the main loop, continue pre-setting the first long
 *			word of each cache line ST_CHUNK lines in advance while
 *			setting the other seven long words (56 bytes) of each
 *			cache line until fewer than ST_CHUNK*64 bytes remain.
 *			Then set the remaining seven long words of each cache
 *			line that has already had its first long word set.
 *		}
 *		store remaining data in 64-byte chunks until fewer than
 *		64 bytes remain.
 *	}
 *	Store as many 8-byte chunks as possible, followed by the trailing
 *	bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline.  That avoids various pipeline delays,
 *   such as filling the miss buffer.  The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memsets of more than MIN_LOOP bytes because a
 * sequence of BIS stores must be followed by a membar #StoreStore.  The
 * benefit of the BIS store must be balanced against the cost of the
 * membar operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#include <asm/asi.h>
#include <asm/page.h>

#define	ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define	ASI_STBIMRU_P	ASI_ST_BLKINIT_MRU_P


#define	ST_CHUNK	24   /* multiple of 4 due to loop unrolling */
#define	MIN_LOOP	16320
#define	MIN_ZERO	512

	.section	".text"
	.align	32

/*
 * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
 * (can create a more optimized version later.)
 */
	.globl	M7clear_page
	.globl	M7clear_user_page
M7clear_page:		/* clear_page(dest) */
M7clear_user_page:
	set	PAGE_SIZE, %o1
	/* fall through into bzero code */

	.size	M7clear_page,.-M7clear_page
	.size	M7clear_user_page,.-M7clear_user_page

/*
 * Define bzero(dest, n) as memset(dest, 0, n)
 * (can create a more optimized version later.)
 */
	.globl	M7bzero
M7bzero:		/* bzero(dest, size) */
	mov	%o1, %o2
	mov	0, %o1
	/* fall through into memset code */

	.size	M7bzero,.-M7bzero

	.global	M7memset
	.type	M7memset, #function
	.register	%g3, #scratch
M7memset:
	mov	%o0, %o5		! copy sp1 before using it
	cmp	%o2, 7			! if small counts, just write bytes
	bleu,pn	%xcc, .wrchar
	and	%o1, 0xff, %o1		! o1 is (char)c

	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		! now o1 has 2 bytes of c
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%xcc, .wdalign
	or	%o1, %o3, %o1		! now o1 has 4 bytes of c

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		! now o1 has 8 bytes of c

.dbalign:
	andcc	%o5, 7, %o3		! is sp1 aligned on a 8 byte bound?
	bz,pt	%xcc, .blkalign		! already long word aligned
	sub	%o3, 8, %o3		! -(bytes till long word aligned)

	add	%o2, %o3, %o2		! update o2 with new count
	! Set -(%o3) bytes till sp1 long word aligned
1:	stb	%o1, [%o5]		! there is at least 1 byte to set
	inccc	%o3			! byte clearing loop
	bl,pt	%xcc, 1b
	inc	%o5

	! Now sp1 is long word aligned (sp1 is found in %o5)
.blkalign:
	cmp	%o2, 64			! check if there are 64 bytes to set
	blu,pn	%xcc, .wrshort
	mov	%o2, %o3

	andcc	%o5, 63, %o3		! is sp1 block aligned?
	bz,pt	%xcc, .blkwr		! now block aligned
	sub	%o3, 64, %o3		! o3 is -(bytes till block aligned)
	add	%o2, %o3, %o2		! o2 is the remainder

	! Store -(%o3) bytes till dst is block (64 byte) aligned.
	! Use long word stores.
	! Recall that dst is already long word aligned
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%xcc, 1b
	add	%o5, 8, %o5

	! Now sp1 is block aligned
.blkwr:
	andn	%o2, 63, %o4		! calculate size of blocks in bytes
	brz,pn	%o1, .wrzero		! special case if c == 0
	and	%o2, 63, %o3		! %o3 = bytes left after blk stores.

	set	MIN_LOOP, %g1
	cmp	%o4, %g1		! check there are enough bytes to set
	blu,pn	%xcc, .short_set	! to justify cost of membar
					! must be > pre-cleared lines
	nop

	! initial cache-clearing stores
	! get store pipeline moving
	rd	%asi, %g3		! save %asi to be restored later
	wr	%g0, ASI_STBIMRU_P, %asi

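	! At this point %o1 holds the fill byte replicated across all 8
	! bytes, %o5 is 64-byte aligned and %o4 holds the block byte count.
	! .wr_loop_start pre-sets the first long word of ST_CHUNK cache
	! lines (four lines per iteration) using ASI_STBIMRU_P, so the
	! lines stay most-recently-used until .wr_loop_rest fills in the
	! rest of each line.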
	! Primary memset loop for large memsets
.wr_loop:
	sub	%o5, 8, %o5		! adjust %o5 for ASI store alignment
	mov	ST_CHUNK, %g1
.wr_loop_start:
	stxa	%o1, [%o5+8]%asi
	subcc	%g1, 4, %g1
	stxa	%o1, [%o5+8+64]%asi
	add	%o5, 256, %o5
	stxa	%o1, [%o5+8-128]%asi
	bgu	%xcc, .wr_loop_start
	stxa	%o1, [%o5+8-64]%asi

	sub	%o5, ST_CHUNK*64, %o5	! reset %o5
	mov	ST_CHUNK, %g1

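	! Fill in the remaining seven long words of each of the ST_CHUNK
	! cache lines whose first long word was pre-set above.  The last
	! store to each line uses ASI_STBI_P so the completed line drops
	! back to least-recently-used.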
.wr_loop_rest:
	stxa	%o1, [%o5+8+8]%asi
	sub	%o4, 64, %o4
	stxa	%o1, [%o5+16+8]%asi
	subcc	%g1, 1, %g1
	stxa	%o1, [%o5+24+8]%asi
	stxa	%o1, [%o5+32+8]%asi
	stxa	%o1, [%o5+40+8]%asi
	add	%o5, 64, %o5
	stxa	%o1, [%o5-8]%asi
	bgu	%xcc, .wr_loop_rest
	stxa	%o1, [%o5]ASI_STBI_P

	! If more than ST_CHUNK*64 bytes remain to set, continue
	! setting the first long word of each cache line in advance
	! to keep the store pipeline moving.

	cmp	%o4, ST_CHUNK*64
	bge,pt	%xcc, .wr_loop_start
	mov	ST_CHUNK, %g1

	brz,a,pn	%o4, .asi_done
	add	%o5, 8, %o5		! restore %o5 offset

.wr_loop_small:
	stxa	%o1, [%o5+8]%asi
	stxa	%o1, [%o5+8+8]%asi
	stxa	%o1, [%o5+16+8]%asi
	stxa	%o1, [%o5+24+8]%asi
	stxa	%o1, [%o5+32+8]%asi
	subcc	%o4, 64, %o4
	stxa	%o1, [%o5+40+8]%asi
	add	%o5, 64, %o5
	stxa	%o1, [%o5-8]%asi
	bgu,pt	%xcc, .wr_loop_small
	stxa	%o1, [%o5]ASI_STBI_P

	ba	.asi_done
	add	%o5, 8, %o5		! restore %o5 offset

	! Special case loop for zero fill memsets
	! For each 64-byte cache line, a single STBI to the first element
	! clears the whole line
.wrzero:
	cmp	%o4, MIN_ZERO		! check if enough bytes to set
					! to pay %asi + membar cost
	blu	%xcc, .short_set
	nop
	sub	%o4, 256, %o4

.wrzero_loop:
	mov	64, %g3
	stxa	%o1, [%o5]ASI_STBI_P
	subcc	%o4, 256, %o4
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%o5, 256, %o5
	sub	%g3, 192, %g3
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%g3, 64, %g3
	bge,pt	%xcc, .wrzero_loop
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%o4, 256, %o4

	brz,pn	%o4, .bsi_done
	nop

.wrzero_small:
	stxa	%o1, [%o5]ASI_STBI_P
	subcc	%o4, 64, %o4
	bgu,pt	%xcc, .wrzero_small
	add	%o5, 64, %o5
	ba,a	.bsi_done

.asi_done:
	wr	%g3, 0x0, %asi		! restore saved %asi
.bsi_done:
	membar	#StoreStore		! required by use of Block Store Init

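	! Non-BIS tail: on entry %o4 holds the bytes still to be set in
	! whole 64-byte chunks (possibly zero) and %o3 holds the tail of
	! fewer than 64 bytes.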
.short_set:
	cmp	%o4, 64			! check if 64 bytes to set
	blu	%xcc, 5f
	nop
4:					! set final blocks of 64 bytes
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%xcc, 4b
	stx	%o1, [%o5-8]

5:
	! Set the remaining long words
.wrshort:
	subcc	%o3, 8, %o3		! Can we store any long words?
	blu,pn	%xcc, .wrchars
	and	%o2, 7, %o2		! calc bytes left after long words
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		! store the long words
	bgeu,pt	%xcc, 6b
	add	%o5, 8, %o5

.wrchars:				! check for extra chars
	brnz	%o2, .wrfin
	nop
	retl
	nop

.wdalign:
	andcc	%o5, 3, %o3		! is sp1 aligned on a word boundary?
	bz,pn	%xcc, .wrword
	andn	%o2, 3, %o3		! create word sized count in %o3

	dec	%o2			! decrement count
	stb	%o1, [%o5]		! clear a byte
	b	.wdalign
	inc	%o5			! next byte

.wrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		! 4-byte writing loop
	bnz,pt	%xcc, .wrword
	add	%o5, 4, %o5

	and	%o2, 3, %o2		! leftover count, if any

.wrchar:
	! Set the remaining bytes, if any
	brz	%o2, .exit
	nop
.wrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%xcc, .wrfin
	inc	%o5
.exit:
	retl				! %o0 was preserved
	nop

	.size	M7memset,.-M7memset