xref: /OK3568_Linux_fs/kernel/arch/arc/lib/memset-archs.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 */
5*4882a593Smuzhiyun
#include <linux/linkage.h>
#include <asm/cache.h>
8*4882a593Smuzhiyun
/*
 * The memset implementation below is optimized to use the prefetchw and
 * prealloc instructions on CPUs with a 64B L1 data cache line
 * (L1_CACHE_SHIFT == 6). If you want to implement an optimized memset for
 * the other possible L1 data cache line lengths (32B and 128B), rewrite the
 * code carefully, checking that no prefetchw/prealloc instruction is issued
 * for an L1 cache line which doesn't belong to the memset area.
 */
17*4882a593Smuzhiyun
/*
 * Cache-hint wrappers used by memset.
 *
 *   PREALLOC_INSTR  reg, off  ->  prealloc  [reg, off]
 *   PREFETCHW_INSTR reg, off  ->  prefetchw [reg, off]
 *
 * The hints are only emitted when the L1 data cache line is 64 bytes
 * (L1_CACHE_SHIFT == 6). For any other line size both macros expand to
 * nothing, so the code using them issues no cache hint at all.
 */
#if L1_CACHE_SHIFT == 6

.macro PREALLOC_INSTR	reg, off
	prealloc	[\reg, \off]
.endm

.macro PREFETCHW_INSTR	reg, off
	prefetchw	[\reg, \off]
.endm

#else

/* Line size is not 64B: the hints are compiled out. */
.macro PREALLOC_INSTR	reg, off
.endm

.macro PREFETCHW_INSTR	reg, off
.endm

#endif
37*4882a593Smuzhiyun
38*4882a593SmuzhiyunENTRY_CFI(memset)
39*4882a593Smuzhiyun	PREFETCHW_INSTR	r0, 0	; Prefetch the first write location
40*4882a593Smuzhiyun	mov.f	0, r2
41*4882a593Smuzhiyun;;; if size is zero
42*4882a593Smuzhiyun	jz.d	[blink]
43*4882a593Smuzhiyun	mov	r3, r0		; don't clobber ret val
44*4882a593Smuzhiyun
45*4882a593Smuzhiyun;;; if length < 8
46*4882a593Smuzhiyun	brls.d.nt	r2, 8, .Lsmallchunk
47*4882a593Smuzhiyun	mov.f	lp_count,r2
48*4882a593Smuzhiyun
49*4882a593Smuzhiyun	and.f	r4, r0, 0x03
50*4882a593Smuzhiyun	rsub	lp_count, r4, 4
51*4882a593Smuzhiyun	lpnz	@.Laligndestination
52*4882a593Smuzhiyun	;; LOOP BEGIN
53*4882a593Smuzhiyun	stb.ab	r1, [r3,1]
54*4882a593Smuzhiyun	sub	r2, r2, 1
55*4882a593Smuzhiyun.Laligndestination:
56*4882a593Smuzhiyun
57*4882a593Smuzhiyun;;; Destination is aligned
58*4882a593Smuzhiyun	and	r1, r1, 0xFF
59*4882a593Smuzhiyun	asl	r4, r1, 8
60*4882a593Smuzhiyun	or	r4, r4, r1
61*4882a593Smuzhiyun	asl	r5, r4, 16
62*4882a593Smuzhiyun	or	r5, r5, r4
63*4882a593Smuzhiyun	mov	r4, r5
64*4882a593Smuzhiyun
65*4882a593Smuzhiyun	sub3	lp_count, r2, 8
66*4882a593Smuzhiyun	cmp     r2, 64
67*4882a593Smuzhiyun	bmsk.hi	r2, r2, 5
68*4882a593Smuzhiyun	mov.ls	lp_count, 0
69*4882a593Smuzhiyun	add3.hi	r2, r2, 8
70*4882a593Smuzhiyun
71*4882a593Smuzhiyun;;; Convert len to Dwords, unfold x8
72*4882a593Smuzhiyun	lsr.f	lp_count, lp_count, 6
73*4882a593Smuzhiyun
74*4882a593Smuzhiyun	lpnz	@.Lset64bytes
75*4882a593Smuzhiyun	;; LOOP START
76*4882a593Smuzhiyun	PREALLOC_INSTR	r3, 64	; alloc next line w/o fetching
77*4882a593Smuzhiyun
78*4882a593Smuzhiyun#ifdef CONFIG_ARC_HAS_LL64
79*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
80*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
81*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
82*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
83*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
84*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
85*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
86*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
87*4882a593Smuzhiyun#else
88*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
89*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
90*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
91*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
92*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
93*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
94*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
95*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
96*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
97*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
98*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
99*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
100*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
101*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
102*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
103*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
104*4882a593Smuzhiyun#endif
105*4882a593Smuzhiyun.Lset64bytes:
106*4882a593Smuzhiyun
107*4882a593Smuzhiyun	lsr.f	lp_count, r2, 5 ;Last remaining  max 124 bytes
108*4882a593Smuzhiyun	lpnz	.Lset32bytes
109*4882a593Smuzhiyun	;; LOOP START
110*4882a593Smuzhiyun#ifdef CONFIG_ARC_HAS_LL64
111*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
112*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
113*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
114*4882a593Smuzhiyun	std.ab	r4, [r3, 8]
115*4882a593Smuzhiyun#else
116*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
117*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
118*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
119*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
120*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
121*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
122*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
123*4882a593Smuzhiyun	st.ab	r4, [r3, 4]
124*4882a593Smuzhiyun#endif
125*4882a593Smuzhiyun.Lset32bytes:
126*4882a593Smuzhiyun
127*4882a593Smuzhiyun	and.f	lp_count, r2, 0x1F ;Last remaining 31 bytes
128*4882a593Smuzhiyun.Lsmallchunk:
129*4882a593Smuzhiyun	lpnz	.Lcopy3bytes
130*4882a593Smuzhiyun	;; LOOP START
131*4882a593Smuzhiyun	stb.ab	r1, [r3, 1]
132*4882a593Smuzhiyun.Lcopy3bytes:
133*4882a593Smuzhiyun
134*4882a593Smuzhiyun	j	[blink]
135*4882a593Smuzhiyun
136*4882a593SmuzhiyunEND_CFI(memset)
137*4882a593Smuzhiyun
/*
 * memzero: zero r1 bytes starting at r0.
 *
 * In:  r0 = destination, r1 = length in bytes
 *
 * Rearranges the arguments into the form memset takes (r1 = fill value,
 * r2 = length) and branches to it. Everything else, including the return
 * to the caller, is done by memset.
 */
ENTRY_CFI(memzero)
	; adjust bzero args to memset args
	mov	r2, r1		; length moves to r2
	b.d	memset		; tail call, so no need to tinker with blink
	mov	r1, 0		; delay slot: fill value = 0
END_CFI(memzero)
144