xref: /OK3568_Linux_fs/kernel/arch/x86/lib/memmove_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Normally compiler builtins are used, but sometimes the compiler calls
4*4882a593Smuzhiyun * out-of-line code. Based on asm-i386/string.h.
5*4882a593Smuzhiyun *
6*4882a593Smuzhiyun * This assembly file is re-written from memmove_64.c file.
7*4882a593Smuzhiyun *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
8*4882a593Smuzhiyun */
9*4882a593Smuzhiyun#include <linux/linkage.h>
10*4882a593Smuzhiyun#include <asm/cpufeatures.h>
11*4882a593Smuzhiyun#include <asm/alternative.h>
12*4882a593Smuzhiyun#include <asm/export.h>
13*4882a593Smuzhiyun
14*4882a593Smuzhiyun#undef memmove
15*4882a593Smuzhiyun
16*4882a593Smuzhiyun/*
17*4882a593Smuzhiyun * Implement memmove(). This can handle overlap between src and dst.
18*4882a593Smuzhiyun *
19*4882a593Smuzhiyun * Input:
20*4882a593Smuzhiyun * rdi: dest
21*4882a593Smuzhiyun * rsi: src
22*4882a593Smuzhiyun * rdx: count
23*4882a593Smuzhiyun *
24*4882a593Smuzhiyun * Output:
25*4882a593Smuzhiyun * rax: dest
26*4882a593Smuzhiyun */
27*4882a593SmuzhiyunSYM_FUNC_START_WEAK(memmove)
28*4882a593SmuzhiyunSYM_FUNC_START(__memmove)
29*4882a593Smuzhiyun
	/*
	 * Register roles throughout:
	 *   rdi = dest cursor, rsi = src cursor, rdx = bytes remaining,
	 *   rax = original dest (the return value), rcx = rep count,
	 *   r8-r11 = scratch for the 32-byte copy loops.
	 */
30*4882a593Smuzhiyun	mov %rdi, %rax	/* memmove() returns dest */
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun	/* Decide forward/backward copy mode */
33*4882a593Smuzhiyun	cmp %rdi, %rsi
34*4882a593Smuzhiyun	jge .Lmemmove_begin_forward	/* src >= dest: forward copy never clobbers unread src */
	/*
	 * NOTE(review): jge/jg below are signed compares on pointers, as in
	 * upstream; overlap only matters within one allocation, so the sign
	 * of the full 64-bit value cannot flip the result — confirm.
	 */
35*4882a593Smuzhiyun	mov %rsi, %r8
36*4882a593Smuzhiyun	add %rdx, %r8	/* r8 = src + count (one past the end of src) */
37*4882a593Smuzhiyun	cmp %rdi, %r8
38*4882a593Smuzhiyun	jg 2f	/* src end extends past dest start: must copy backward */
	/* else the regions are disjoint; forward copy is safe */
39*4882a593Smuzhiyun
40*4882a593Smuzhiyun	/* FSRM implies ERMS => no length checks, do the copy directly */
41*4882a593Smuzhiyun.Lmemmove_begin_forward:
42*4882a593Smuzhiyun	ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM	/* no FSRM: counts < 32 go straight to the tail code at 1: */
43*4882a593Smuzhiyun	ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS	/* ERMS: rep movsb is fast, use it for everything */
44*4882a593Smuzhiyun
45*4882a593Smuzhiyun	/*
46*4882a593Smuzhiyun	 * movsq has noticeable startup latency, so sizes below 680
47*4882a593Smuzhiyun	 * bytes are copied with plain register moves instead.
48*4882a593Smuzhiyun	 */
49*4882a593Smuzhiyun	cmp  $680, %rdx
50*4882a593Smuzhiyun	jb	3f
51*4882a593Smuzhiyun	/*
52*4882a593Smuzhiyun	 * rep movsq only pays off when src and dest are mutually
53*4882a593Smuzhiyun	 * aligned (equal low address bytes).
54*4882a593Smuzhiyun	 */
55*4882a593Smuzhiyun
56*4882a593Smuzhiyun	cmpb %dil, %sil
57*4882a593Smuzhiyun	je 4f
58*4882a593Smuzhiyun3:
59*4882a593Smuzhiyun	sub $0x20, %rdx	/* pre-bias (count >= 0x20 is guaranteed here) so the in-loop sub/jae exits once < 0x20 bytes remain */
60*4882a593Smuzhiyun	/*
61*4882a593Smuzhiyun	 * We gobble 32 bytes forward in each loop.
62*4882a593Smuzhiyun	 */
63*4882a593Smuzhiyun5:
64*4882a593Smuzhiyun	sub $0x20, %rdx
65*4882a593Smuzhiyun	movq 0*8(%rsi), %r11
66*4882a593Smuzhiyun	movq 1*8(%rsi), %r10
67*4882a593Smuzhiyun	movq 2*8(%rsi), %r9
68*4882a593Smuzhiyun	movq 3*8(%rsi), %r8
69*4882a593Smuzhiyun	leaq 4*8(%rsi), %rsi	/* lea: advances cursor without touching flags */
70*4882a593Smuzhiyun
71*4882a593Smuzhiyun	movq %r11, 0*8(%rdi)
72*4882a593Smuzhiyun	movq %r10, 1*8(%rdi)
73*4882a593Smuzhiyun	movq %r9, 2*8(%rdi)
74*4882a593Smuzhiyun	movq %r8, 3*8(%rdi)
75*4882a593Smuzhiyun	leaq 4*8(%rdi), %rdi
76*4882a593Smuzhiyun	jae 5b	/* CF is still from the sub at 5:; mov/lea preserve flags */
77*4882a593Smuzhiyun	addq $0x20, %rdx	/* undo the bias: rdx = tail bytes, now < 0x20 */
78*4882a593Smuzhiyun	jmp 1f
79*4882a593Smuzhiyun	/*
80*4882a593Smuzhiyun	 * Handle data forward by movsq.
81*4882a593Smuzhiyun	 */
82*4882a593Smuzhiyun	.p2align 4
83*4882a593Smuzhiyun4:
84*4882a593Smuzhiyun	movq %rdx, %rcx
85*4882a593Smuzhiyun	movq -8(%rsi, %rdx), %r11	/* save last qword: rep movsq below copies only count/8 whole qwords */
86*4882a593Smuzhiyun	lea -8(%rdi, %rdx), %r10	/* r10 = address of the last qword of dest */
87*4882a593Smuzhiyun	shrq $3, %rcx	/* rcx = count / 8 */
88*4882a593Smuzhiyun	rep movsq
89*4882a593Smuzhiyun	movq %r11, (%r10)	/* store tail qword, covering the count%8 leftover */
90*4882a593Smuzhiyun	jmp 13f
91*4882a593Smuzhiyun.Lmemmove_end_forward:
92*4882a593Smuzhiyun
93*4882a593Smuzhiyun	/*
94*4882a593Smuzhiyun	 * Handle data backward by movsq.
95*4882a593Smuzhiyun	 */
96*4882a593Smuzhiyun	.p2align 4
97*4882a593Smuzhiyun7:
98*4882a593Smuzhiyun	movq %rdx, %rcx
99*4882a593Smuzhiyun	movq (%rsi), %r11	/* save the head qword before the copy can overwrite it */
100*4882a593Smuzhiyun	movq %rdi, %r10	/* r10 = head of dest */
101*4882a593Smuzhiyun	leaq -8(%rsi, %rdx), %rsi	/* point both cursors at the last qword */
102*4882a593Smuzhiyun	leaq -8(%rdi, %rdx), %rdi
103*4882a593Smuzhiyun	shrq $3, %rcx	/* rcx = count / 8 */
104*4882a593Smuzhiyun	std	/* DF=1: rep movsq runs descending */
105*4882a593Smuzhiyun	rep movsq
106*4882a593Smuzhiyun	cld	/* restore DF=0 — required before returning to C code */
107*4882a593Smuzhiyun	movq %r11, (%r10)	/* store head qword, covering the count%8 leftover */
108*4882a593Smuzhiyun	jmp 13f
109*4882a593Smuzhiyun
110*4882a593Smuzhiyun	/*
111*4882a593Smuzhiyun	 * Start to prepare for backward copy.
112*4882a593Smuzhiyun	 */
113*4882a593Smuzhiyun	.p2align 4
114*4882a593Smuzhiyun2:
115*4882a593Smuzhiyun	cmp $0x20, %rdx
116*4882a593Smuzhiyun	jb 1f	/* tiny copy: the overlap-safe tail code handles it */
117*4882a593Smuzhiyun	cmp $680, %rdx
118*4882a593Smuzhiyun	jb 6f	/* small/medium: 32-byte backward register loop */
119*4882a593Smuzhiyun	cmp %dil, %sil
120*4882a593Smuzhiyun	je 7b	/* large and mutually aligned: backward rep movsq */
121*4882a593Smuzhiyun6:
122*4882a593Smuzhiyun	/*
123*4882a593Smuzhiyun	 * Calculate copy position to tail.
124*4882a593Smuzhiyun	 */
125*4882a593Smuzhiyun	addq %rdx, %rsi	/* move both cursors one past the end */
126*4882a593Smuzhiyun	addq %rdx, %rdi
127*4882a593Smuzhiyun	subq $0x20, %rdx	/* pre-bias, mirroring the forward loop at 3: */
128*4882a593Smuzhiyun	/*
129*4882a593Smuzhiyun	 * We gobble 32 bytes backward in each loop.
130*4882a593Smuzhiyun	 */
131*4882a593Smuzhiyun8:
132*4882a593Smuzhiyun	subq $0x20, %rdx
133*4882a593Smuzhiyun	movq -1*8(%rsi), %r11
134*4882a593Smuzhiyun	movq -2*8(%rsi), %r10
135*4882a593Smuzhiyun	movq -3*8(%rsi), %r9
136*4882a593Smuzhiyun	movq -4*8(%rsi), %r8
137*4882a593Smuzhiyun	leaq -4*8(%rsi), %rsi
138*4882a593Smuzhiyun
139*4882a593Smuzhiyun	movq %r11, -1*8(%rdi)
140*4882a593Smuzhiyun	movq %r10, -2*8(%rdi)
141*4882a593Smuzhiyun	movq %r9, -3*8(%rdi)
142*4882a593Smuzhiyun	movq %r8, -4*8(%rdi)
143*4882a593Smuzhiyun	leaq -4*8(%rdi), %rdi
144*4882a593Smuzhiyun	jae 8b	/* CF from the subq at 8:; mov/lea preserve flags */
145*4882a593Smuzhiyun	/*
146*4882a593Smuzhiyun	 * Calculate copy position to head.
147*4882a593Smuzhiyun	 */
148*4882a593Smuzhiyun	addq $0x20, %rdx	/* undo the bias: rdx = head bytes, now < 0x20 */
149*4882a593Smuzhiyun	subq %rdx, %rsi	/* rewind cursors to the start of the remainder */
150*4882a593Smuzhiyun	subq %rdx, %rdi
151*4882a593Smuzhiyun1:
	/*
	 * Tail / small copy: 0..31 bytes remain.  Every case below loads
	 * all bytes before storing any, so it is overlap-safe in either
	 * direction.
	 */
152*4882a593Smuzhiyun	cmpq $16, %rdx
153*4882a593Smuzhiyun	jb 9f
154*4882a593Smuzhiyun	/*
155*4882a593Smuzhiyun	 * Move data from 16 bytes to 31 bytes.
156*4882a593Smuzhiyun	 */
157*4882a593Smuzhiyun	movq 0*8(%rsi), %r11
158*4882a593Smuzhiyun	movq 1*8(%rsi), %r10
159*4882a593Smuzhiyun	movq -2*8(%rsi, %rdx), %r9	/* end-anchored loads overlap the head ones when rdx < 32 */
160*4882a593Smuzhiyun	movq -1*8(%rsi, %rdx), %r8
161*4882a593Smuzhiyun	movq %r11, 0*8(%rdi)
162*4882a593Smuzhiyun	movq %r10, 1*8(%rdi)
163*4882a593Smuzhiyun	movq %r9, -2*8(%rdi, %rdx)
164*4882a593Smuzhiyun	movq %r8, -1*8(%rdi, %rdx)
165*4882a593Smuzhiyun	jmp 13f
166*4882a593Smuzhiyun	.p2align 4
167*4882a593Smuzhiyun9:
168*4882a593Smuzhiyun	cmpq $8, %rdx
169*4882a593Smuzhiyun	jb 10f
170*4882a593Smuzhiyun	/*
171*4882a593Smuzhiyun	 * Move data from 8 bytes to 15 bytes.
172*4882a593Smuzhiyun	 */
173*4882a593Smuzhiyun	movq 0*8(%rsi), %r11
174*4882a593Smuzhiyun	movq -1*8(%rsi, %rdx), %r10
175*4882a593Smuzhiyun	movq %r11, 0*8(%rdi)
176*4882a593Smuzhiyun	movq %r10, -1*8(%rdi, %rdx)
177*4882a593Smuzhiyun	jmp 13f
178*4882a593Smuzhiyun10:
179*4882a593Smuzhiyun	cmpq $4, %rdx
180*4882a593Smuzhiyun	jb 11f
181*4882a593Smuzhiyun	/*
182*4882a593Smuzhiyun	 * Move data from 4 bytes to 7 bytes.
183*4882a593Smuzhiyun	 */
184*4882a593Smuzhiyun	movl (%rsi), %r11d
185*4882a593Smuzhiyun	movl -4(%rsi, %rdx), %r10d
186*4882a593Smuzhiyun	movl %r11d, (%rdi)
187*4882a593Smuzhiyun	movl %r10d, -4(%rdi, %rdx)
188*4882a593Smuzhiyun	jmp 13f
189*4882a593Smuzhiyun11:
190*4882a593Smuzhiyun	cmp $2, %rdx
191*4882a593Smuzhiyun	jb 12f
192*4882a593Smuzhiyun	/*
193*4882a593Smuzhiyun	 * Move data from 2 bytes to 3 bytes.
194*4882a593Smuzhiyun	 */
195*4882a593Smuzhiyun	movw (%rsi), %r11w
196*4882a593Smuzhiyun	movw -2(%rsi, %rdx), %r10w
197*4882a593Smuzhiyun	movw %r11w, (%rdi)
198*4882a593Smuzhiyun	movw %r10w, -2(%rdi, %rdx)
199*4882a593Smuzhiyun	jmp 13f
200*4882a593Smuzhiyun12:
201*4882a593Smuzhiyun	cmp $1, %rdx
202*4882a593Smuzhiyun	jb 13f	/* rdx == 0: nothing to do */
203*4882a593Smuzhiyun	/*
204*4882a593Smuzhiyun	 * Move data for 1 byte.
205*4882a593Smuzhiyun	 */
206*4882a593Smuzhiyun	movb (%rsi), %r11b
207*4882a593Smuzhiyun	movb %r11b, (%rdi)
208*4882a593Smuzhiyun13:
209*4882a593Smuzhiyun	RET
210*4882a593Smuzhiyun
211*4882a593Smuzhiyun.Lmemmove_erms:
212*4882a593Smuzhiyun	movq %rdx, %rcx
213*4882a593Smuzhiyun	rep movsb	/* forward-safe here per the overlap checks at function entry */
214*4882a593Smuzhiyun	RET
215*4882a593SmuzhiyunSYM_FUNC_END(__memmove)
216*4882a593SmuzhiyunSYM_FUNC_END_ALIAS(memmove)
217*4882a593SmuzhiyunEXPORT_SYMBOL(__memmove)
218*4882a593SmuzhiyunEXPORT_SYMBOL(memmove)
218