xref: /OK3568_Linux_fs/kernel/arch/arm64/lib/memmove.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dst (alignment handled by the hardware).
 * If dest is below src, or if the regions do not overlap, call __memcpy;
 * otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_PI(memmove)
	cmp	dstin, src
	b.lo	__memcpy
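	/* dst is below src: a forward copy via __memcpy cannot clobber unread source bytes. */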
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap.  */

	add	dst, dstin, count
	add	src, src, count
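	/*
	* Overlapping copy, done backwards: dst and src now point one byte
	* past the end of their buffers, and every access below uses a
	* negative offset or pre-decrement addressing.
	*/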
	cmp	count, #16
	b.lo	.Ltail15  /* probably unaligned accesses. */

	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Process the unaligned offset first so that src becomes aligned.
	* The cost of these few extra instructions is acceptable, and it
	* makes all of the following accesses use aligned addresses.
	*/
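	/* Test bits 0-3 of tmp2 (src & 15) to copy 1, 2, 4 and then 8 bytes as needed. */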
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
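	/* 0x30: fall through and copy 48 bytes; 0x20: start at 1: (32 bytes); 0x10: start at 2: (16 bytes). */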
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

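	/* Copy the remaining 0-15 bytes by testing bits 3, 2, 1 and 0 of count. */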
.Ltail15:
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
	* to the tail.
	*/
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!
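	/* Only the final pair uses writeback, so src and dst step back by the full 64 bytes for the tail code. */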

	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop. Start at a new cache line boundary. Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
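	/*
	* count was pre-biased by -128 at .Lcpy_over64, so the b.ge below
	* re-runs the loop only while another full 64-byte block remains.
	*/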
1:
	/*
	* Interleave the loads of the next 64-byte block with the stores of
	* the previous 64 bytes of loaded data.
	*/
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
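	/* Store the final 64 bytes that were loaded on the last pass. */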
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
SYM_FUNC_END_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)