xref: /OK3568_Linux_fs/kernel/arch/arm64/lib/copy_template.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex strings work originally authored by
 * Linaro and can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
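/*
 * Note: the ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 accessors
 * used below are not instructions but assembler macros that the including
 * file (e.g. memcpy.S or the user-access copy routines) is expected to
 * define, typically as post-indexed loads/stores or their uaccess
 * variants. An illustrative sketch of such an instantiation is appended
 * at the end of this file.
 */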
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16 bytes, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
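	/*
	* For example, with src = 0x1003: tmp2 = (-0x1003) & 15 = 13, so 13
	* bytes must be copied before src becomes 16-byte aligned.
	*/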
	sub	count, count, tmp2
	/*
	* Copy the leading data from src to dst in increasing address
	* order. This way, the risk of overwriting the source data is
	* eliminated when the distance between src and dst is less than
	* 16. The memory accesses here are aligned.
	*/
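	/*
	* Continuing the example, tmp2 = 13 = 0b1101: bits 0, 2 and 3 are
	* set, so 1 + 4 + 8 = 13 leading bytes are copied below and src is
	* then 16-byte aligned.
	*/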
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
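	/*
	* count & 0x30 selects how many 16-byte chunks remain: 0x30 falls
	* through all three ldp1/stp1 pairs (48 bytes), 0x20 branches to 1:
	* (32 bytes), 0x10 branches to 2: (16 bytes), and 0 skips straight
	* to .Ltiny15.
	*/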
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	* Prefer to break one ldp/stp into several loads/stores that access
	* memory in increasing address order, rather than loading/storing 16
	* bytes from (src-16) to (dst-16) after backing src up to an aligned
	* address, as the original cortex memcpy does. If the original scheme
	* were kept here, memmove would have to satisfy the precondition that
	* the src address is at least 16 bytes greater than the dst address;
	* otherwise some source data would be overwritten when memmove calls
	* memcpy directly. To keep memmove simple and to decouple memcpy from
	* memmove, that scheme was dropped.
	*/
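	/*
	* For example, if the low four bits of count are 0b1011 (11 bytes
	* left), the chain below copies 8, then 2, then 1 byte(s), again in
	* increasing address order.
	*/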
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*/
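	/*
	* Here 64 <= length < 128 and count now holds length - 128 (negative).
	* Copy 64 bytes below; since subtracting 128 leaves the low six bits
	* of count untouched, the tst against 0x3f then routes the remaining
	* length - 64 bytes through .Ltail63.
	*/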
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	* Critical loop.  Start at a new cache line boundary.  Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Preload the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	* Interleave the load of the next 64-byte block with the store of
	* the previously loaded 64 bytes.
	*/
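	/*
	* On entry to each iteration A_l/A_h..D_l/D_h hold the most recently
	* loaded 64 bytes. count was pre-decremented by 128, so the loop
	* keeps loading ahead only while at least another 64 bytes of source
	* remain beyond the block currently held in registers.
	*/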
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
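	/*
	* The loop exits with the last 64 loaded bytes still in registers;
	* drain them to dst, then hand any sub-64-byte remainder to .Ltail63.
	*/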
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
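
/*
 * A minimal illustrative sketch of how an including file such as memcpy.S
 * typically defines the accessor macros before pulling in this template.
 * The exact function annotations and the set of includers vary by kernel
 * version, so treat this as an assumption rather than this tree's actual
 * code:
 *
 *	.macro ldrb1 reg, ptr, val
 *	ldrb	\reg, [\ptr], \val
 *	.endm
 *
 *	.macro strb1 reg, ptr, val
 *	strb	\reg, [\ptr], \val
 *	.endm
 *
 *	.macro ldp1 reg1, reg2, ptr, val
 *	ldp	\reg1, \reg2, [\ptr], \val
 *	.endm
 *
 *	.macro stp1 reg1, reg2, ptr, val
 *	stp	\reg1, \reg2, [\ptr], \val
 *	.endm
 *
 *	(ldrh1/strh1 and ldr1/str1 follow the same pattern.)
 *
 *	SYM_FUNC_START(__memcpy)
 *	#include "copy_template.S"
 *	ret
 *	SYM_FUNC_END(__memcpy)
 *
 * The user-access copy routines (copy_from_user.S, copy_to_user.S) define
 * the same macros in terms of their unprivileged load/store helpers.
 */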