xref: /OK3568_Linux_fs/kernel/arch/hexagon/lib/memset.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 */
5*4882a593Smuzhiyun
6*4882a593Smuzhiyun
/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun
/*
 * HEXAGON_OPT_FUNC_BEGIN name
 *
 * Open a global function: switch to the .text section, align to
 * 2^4 = 16 bytes, export the symbol, mark it as a function and emit its
 * entry label.  Pair with HEXAGON_OPT_FUNC_FINISH, which records the size.
 */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm
18*4882a593Smuzhiyun
/*
 * HEXAGON_OPT_FUNC_FINISH name
 *
 * Close a function opened with HEXAGON_OPT_FUNC_BEGIN: set the symbol size
 * to the distance from the entry label to the current location.
 */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm
22*4882a593Smuzhiyun
/*
 * FUNCTION: memset (v2 version)
 *
 * Built only when __HEXAGON_ARCH__ < 3.
 *
 * Register use, as read from the code below:
 *   r0     destination pointer; never written, so it is still the return value
 *   r1     fill value; only the byte replicated by vsplatb is stored
 *   r2     bytes still to write (low word of the pair r3:2, r3 is set to 0)
 *   r5:4   fill pattern (r4 = vsplatb(r1), r5 = r4)
 *   r7:6   size of the store being issued (r7 = 0), subtracted from r3:2
 *   r8     write cursor
 *   r9     8 - (dest & 7): bytes up to the next 8-byte boundary (8 if aligned)
 *   r10    trip count of the doubleword loop
 *
 * Instructions inside one { } packet all read the register and predicate
 * values from before the packet.  The code relies on this throughout: the
 * conditional jump in a packet tests the predicate computed by the PREVIOUS
 * packet, while the compare in the same packet prepares the predicate for
 * the NEXT one.  Likewise a compare on r2 that shares a packet with the
 * subtract sees the count from before the subtract.
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)  /* dest & 7 */
		p0 = cmp.eq(r2, #0)
		p1 = cmp.gtu(r2, #7)
	}
	{
		r4 = vsplatb(r1)
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if p0 jumpr r31   /* count == 0, so return  */
	}
	{
		r3 = #0           /* high word of the count pair r3:2 */
		r7 = #0           /* high word of the step pair r7:6 */
		p0 = tstbit(r9, #0)  /* consumed by the jump at 2: */
		if p1 jump 2f /* count > 7: skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop, r2 is 1..7 here */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31

/*
 * 8 bytes or more: store a byte, a halfword and a word as selected by
 * bits 0, 1 and 2 of r9 so that r8 ends up 8-byte aligned.
 */
	.falign
2: /* skip byte loop */
	{
		r6 = #1
		p0 = tstbit(r9, #1)  /* consumed by the jump at 3: */
		p1 = cmp.eq(r2, #1)
		if !p0 jump 3f /* bit 0 of r9 clear: skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 1 */
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)  /* consumed by the jump at 4: */
		p1 = cmp.eq(r2, #2)
		if !p0 jump 4f /* bit 1 of r9 clear: skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 2 */
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)  /* used at 5: when the word store is skipped */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 5f /* bit 2 of r9 clear: skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 4 */
		p0 = cmp.gtu(r2, #11)   /* pre-subtract count > 11, i.e. more than 7 left */
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	{
		r10 = lsr(r2, #3)       /* number of whole doublewords left */
		/*
		 * r3 is still 0 here (the count was at least 8 and at most 7
		 * bytes have been subtracted, so no borrow), so this clears p1
		 * for the test at 7: when the double loop is skipped.
		 */
		p1 = cmp.eq(r3, #1)
		if !p0 jump 7f /* fewer than 8 bytes left: skip double loop */
	}
	{
		r5 = r4
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)  /* count -= 8 */
		p1 = cmp.eq(r2, #8)     /* pre-subtract count == 8: nothing left */
	}:endloop0

/* at most 7 bytes left: finish with a word, a halfword and a byte as needed */

	.falign
7: /* skip double loop */
	{
		p0 = tstbit(r2, #2)  /* consumed by the jump in the next packet */
		if p1 jumpr r31      /* double loop consumed everything */
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)  /* consumed by the jump at 8: */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 8f /* bit 2 of count clear: skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 4 */
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	{
		p1 = cmp.eq(r2, #2)
		if !p0 jump 9f /* bit 1 of count clear: skip final half store */
	}
	{
		memh(r8++#2) = r4  /* r2 is not updated any more from here on */
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4  /* the one remaining byte */
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif
151*4882a593Smuzhiyun
152*4882a593Smuzhiyun
/*
 * FUNCTION: memset (v3 and higher version)
 *
 * Built only when __HEXAGON_ARCH__ >= 3.
 *
 * Register use, as read from the code below:
 *   r0     destination pointer; never written, so it is still the return value
 *   r1     fill value; byte stores write its low byte
 *   r2     bytes still to write
 *   r3     byte-loop cursor, later scratch and the 32-byte chunk count
 *   r5:4   fill byte replicated into a doubleword
 *   r6     write cursor
 *   r7     fill byte replicated into a word (vsplatb)
 *   r8     trip count of the doubleword loop
 *
 * Instructions inside one { } packet read register values from before the
 * packet; a jump on pN.new tests the predicate produced in the same packet.
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)
		r6 = r0
		if (r2==#0) jump:nt .L1      /* count == 0: just return */
	}
	{
		r5:4=combine(r7,r7)
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3      /* more than 8 bytes: aligned path */
	}

/* 1..8 bytes: store them one at a time and return */

	{
		r3 = r0
		loop0(.L47,r2)
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31

/* more than 8 bytes: first bring the cursor r6 up to an 8-byte boundary */

.L3:
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8     /* address is already even */
		p1 = cmp.eq(r2, #1)          /* never true here: count > 8 */
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10    /* already 4-byte aligned */
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)          /* compares r2 from before the subtract */
		if (p0.new) jump:nt .L1      /* those 2 bytes were the last ones */
	}
.L10:
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12    /* already 8-byte aligned */
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)          /* compares r2 from before the subtract */
		if (p0.new) jump:nt .L1      /* those 4 bytes were the last ones */
	}

/* r6 is 8-byte aligned from here on */

.L12:
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14    /* 127 bytes or fewer: doublewords only */
	}

/*
 * More than 127 bytes: store doublewords (at most three) until r6 is
 * 32-byte aligned, then continue in 32-byte chunks.
 */
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:
	{
		r3 = lsr(r2,#5)              /* number of whole 32-byte chunks */
		/*
		 * This tests all of r1, not only its low byte: a value whose
		 * low byte is zero but whose upper bits are set takes the
		 * .L18 path, which still writes the correct pattern from r5:4.
		 */
		if (r1!=#0) jump:nt .L18
	}

/* fill value is zero: one dczeroa per 32-byte chunk is the only write */

	{
		r8 = r3                      /* dead: r8 is recomputed at .L14 */
		r3 = r6                      /* dead: r3 is not read again on this path */
		loop0(.L46,r3)               /* trip count is r3 from before this packet */
	}
	.falign
.L46:
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */

/* fewer than 32 bytes left on the chunked paths: whole doublewords first */

.L14:
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28    /* fewer than 8 bytes left */
		r8 = lsr(r2,#3)              /* number of whole doublewords */
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */

/* at most 7 bytes left: word, halfword and byte as selected by the count */

.L28:
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31

/*
 * Fill value is non-zero: each trip issues dczeroa on the 32-byte chunk and
 * then overwrites it with four doubleword stores.
 * NOTE(review): dczeroa is presumably there to claim the cache line without
 * first reading it from memory - confirm against the Hexagon manual.
 */
.L18:
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45  */
		jump .L14                    /* finish the remaining < 32 bytes */
HEXAGON_OPT_FUNC_FINISH memset
#endif
303