/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 */

#include <linux/linkage.h>
#include <asm/cache.h>

/*
 * The memset implementation below is optimized to use the prefetchw and
 * prealloc instructions on CPUs with a 64B L1 data cache line
 * (L1_CACHE_SHIFT == 6). If you want to implement an optimized memset for
 * other possible L1 data cache line lengths (32B and 128B), rewrite the
 * code carefully, making sure no prefetchw/prealloc instruction is issued
 * for an L1 cache line that doesn't belong to the memset area.
 *
 * A plain-C reference sketch of the overall flow appears at the end of
 * this file.
 */

#if L1_CACHE_SHIFT == 6

.macro PREALLOC_INSTR	reg, off
	prealloc	[\reg, \off]
.endm

.macro PREFETCHW_INSTR	reg, off
	prefetchw	[\reg, \off]
.endm

#else

.macro PREALLOC_INSTR	reg, off
.endm

.macro PREFETCHW_INSTR	reg, off
.endm

#endif

ENTRY_CFI(memset)
	PREFETCHW_INSTR	r0, 0		; Prefetch the first write location
	mov.f	0, r2
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0			; don't clobber ret val

;;; if length < 8
	brls.d.nt	r2, 8, .Lsmallchunk
	mov.f	lp_count, r2

;;; Align the destination to a 4-byte boundary with byte stores
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	stb.ab	r1, [r3, 1]
	sub	r2, r2, 1
.Laligndestination:

;;; Destination is aligned: replicate the fill byte across a 32-bit
;;; pattern in r4 and r5 (r4:r5 is the 64-bit pair used by std)
	and	r1, r1, 0xFF
	asl	r4, r1, 8
	or	r4, r4, r1
	asl	r5, r4, 16
	or	r5, r5, r4
	mov	r4, r5

;;; Size the 64B loop so at least one full chunk is held back for the
;;; smaller loops below; its prealloc of the next cache line then always
;;; stays inside the memset area
	sub3	lp_count, r2, 8
	cmp	r2, 64
	bmsk.hi	r2, r2, 5
	mov.ls	lp_count, 0
	add3.hi	r2, r2, 8

;;; Convert len to Dwords, unfold x8
	lsr.f	lp_count, lp_count, 6

	lpnz	@.Lset64bytes
	;; LOOP START
	PREALLOC_INSTR	r3, 64		; alloc next line w/o fetching

#ifdef CONFIG_ARC_HAS_LL64
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
#else
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
#endif
.Lset64bytes:

	lsr.f	lp_count, r2, 5		; Last remaining max 124 bytes
	lpnz	.Lset32bytes
	;; LOOP START
#ifdef CONFIG_ARC_HAS_LL64
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
#else
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
#endif
.Lset32bytes:

	and.f	lp_count, r2, 0x1F	; Last remaining 31 bytes
.Lsmallchunk:
	lpnz	.Lcopy3bytes
	;; LOOP START
	stb.ab	r1, [r3, 1]
.Lcopy3bytes:

	j	[blink]

END_CFI(memset)

ENTRY_CFI(memzero)
	; adjust bzero args to memset args
	mov	r2, r1
	b.d	memset			; tail call so need to tinker with blink
	mov	r1, 0
END_CFI(memzero)
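/*
 * For reference only: a minimal C sketch of the flow implemented above,
 * under the simplifying assumption of plain 32-bit word stores (no 64B/32B
 * unrolling, no cache-line prealloc/prefetchw). It is not part of the
 * build, and the function name is hypothetical.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void *memset_sketch(void *dst, int c, size_t n)
 *	{
 *		unsigned char *p = dst;
 *		unsigned int pat = (unsigned char)c;
 *
 *		pat |= pat << 8;	// 0x000000cc -> 0x0000cccc
 *		pat |= pat << 16;	// 0x0000cccc -> 0xcccccccc
 *
 *		while (((uintptr_t)p & 3) && n) {	// align destination
 *			*p++ = (unsigned char)c;
 *			n--;
 *		}
 *		while (n >= 4) {	// aligned word stores of the pattern
 *			*(unsigned int *)p = pat;
 *			p += 4;
 *			n -= 4;
 *		}
 *		while (n--)		// sub-word tail, byte stores
 *			*p++ = (unsigned char)c;
 *
 *		return dst;
 *	}
 */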