/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */
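/*
 * For reference only, a rough C-level sketch of the interface implemented
 * below (illustrative, not part of the build; standard memset semantics
 * are assumed):
 *
 *	void *memset(void *buf, int c, size_t n)
 *	{
 *		unsigned char *p = buf;
 *
 *		while (n--)
 *			*p++ = (unsigned char)c;
 *		return buf;
 *	}
 */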

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START_ALIAS(__memset)
SYM_FUNC_START_WEAK_PI(memset)
	mov	dst, dstin	/* Preserve return value.  */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
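	/*
	 * A_l now holds the fill byte replicated into all eight bytes,
	 * e.g. c = 0xab yields A_l = 0xabababababababab.
	 */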

	cmp	count, #15
	b.hi	.Lover16_proc
	/* All of the stores below may be unaligned. */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
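	/*
	 * tmp2 = (-dst) & 15: the number of bytes needed to reach the next
	 * 16-byte boundary (zero if dst is already aligned).
	 */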
	b.eq	.Laligned
/*
 * The count is at least 16, so we can use stp to store the first 16 bytes
 * and then advance dst to the next 16-byte boundary. After this the
 * current store address sits on an alignment boundary.
 */
	stp	A_l, A_l, [dst]	/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
/*
 * The remaining length is less than 16; use stp to write the last 16 bytes.
 * This writes some bytes twice and the access may be unaligned.
 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16	/* Pre-bias.  */
	sub	count, count, #64
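	/*
	 * dst is biased down by 16 so the four stp instructions below cover
	 * dst+16 .. dst+79, with the pre-indexed writeback on the last stp
	 * advancing dst by 64 each iteration. count is biased by 64 so that
	 * b.ge exits once fewer than 64 bytes remain.
	 */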
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* count is at least 128 bytes from here on */

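	/*
	 * DCZID_EL0 bit 4 is the DZP flag: if it is set, DC ZVA is
	 * prohibited, so fall back to the plain store loop.
	 */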
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
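	/* zva_len = 4 << DCZID_EL0.BS, the DC ZVA block size in bytes. */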

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is at least 64: it is not worth using DC ZVA
	 * when the block size is smaller than 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
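	/*
	 * tmp2 = (-dst) & (zva_len - 1): bytes needed to reach the next
	 * ZVA block boundary.
	 */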
	b.eq	2f			/* Already aligned.  */
	/* Not aligned; check that there's enough to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length left for DC ZVA is at least 64 bytes and
	 * at least one block, so the loop at 2f cannot run past the end of
	 * the buffer.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
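	/*
	 * If tmp1 >= 64, the ccmp compares tmp1 with zva_len_x; otherwise it
	 * sets NZCV to 0b1000 (N=1) so that the b.lt below is taken.
	 */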
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
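	/*
	 * count is pre-biased by one block: the loop below issues DC ZVA
	 * while at least one whole block remains, and the leftover bytes
	 * are recovered by the ands after the loop.
	 */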
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
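	/* count &= (zva_len - 1): leftover bytes that do not fill a whole block. */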
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END_PI(memset)
EXPORT_SYMBOL(memset)
SYM_FUNC_END_ALIAS(__memset)
EXPORT_SYMBOL(__memset)