/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses the
 * fast-string REP STOS instructions to get better performance than the
 * original open-coded loop (memset_orig below). The code is simpler and
 * shorter as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
	/*
	 * Some CPUs support the Enhanced REP MOVSB/STOSB (ERMS) feature, and
	 * it is recommended to use it when available. If it is not, fall
	 * back to the fast-string REP STOSQ code below; otherwise use the
	 * original open-coded memset.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS
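	/*
	 * How the ALTERNATIVE_2 above resolves: "jmp memset_orig" is the
	 * default; at boot it is replaced with NOPs (falling through to the
	 * fast-string code below) when X86_FEATURE_REP_GOOD is set, or with
	 * "jmp memset_erms" when X86_FEATURE_ERMS is set. The ERMS patch is
	 * applied later, so it wins when both feature bits are set.
	 */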

	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
	/* expand byte value  */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
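	/*
	 * Worked example of the expansion above: for a fill value of 0xab,
	 * 0xab * 0x0101010101010101 = 0xabababababababab, i.e. the multiply
	 * broadcasts the low byte into all eight byte lanes of %rax.
	 * REP STOSQ then stores count/8 such quadwords and REP STOSB writes
	 * the remaining count%8 tail bytes.
	 */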
	rep stosq
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	RET
SYM_FUNC_END(__memset)
SYM_FUNC_END_ALIAS(memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)
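
/*
 * memset is emitted as a weak symbol (SYM_FUNC_START_WEAK above) so that
 * instrumented builds, e.g. KASAN, can interpose their own definition,
 * while __memset always names this plain implementation.
 */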

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced REP STOSB (ERMS) in place of the fast-string variant above.
 * The code is simpler and shorter than the fast-string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
SYM_FUNC_START_LOCAL(memset_erms)
	movq %rdi,%r9
	movb %sil,%al
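	/*
	 * No byte expansion is needed here: REP STOSB only reads %al, and
	 * on ERMS parts the fast-string microcode is expected to handle all
	 * sizes and alignments efficiently.
	 */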
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	RET
SYM_FUNC_END(memset_erms)

SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10

	/* expand byte value  */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq  %rcx,%rax
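	/* same byte-broadcast multiplication as in the fast-string path */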

	/* align dst */
	movl  %edi,%r9d
	andl  $7,%r9d
	jnz  .Lbad_alignment
.Lafter_bad_alignment:

	movq  %rdx,%rcx
	shrq  $6,%rcx
	jz	.Lhandle_tail

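	/*
	 * Each pass through .Lloop_64 stores one 64-byte block with eight
	 * quadword stores; %rcx = count / 64 iterations remain.
	 */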
	.p2align 4
.Lloop_64:
	decq  %rcx
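	/*
	 * decq sets ZF; the movq and leaq below leave the flags untouched,
	 * so the jnz at the bottom of the loop tests this decrement.
	 */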
	movq  %rax,(%rdi)
	movq  %rax,8(%rdi)
	movq  %rax,16(%rdi)
	movq  %rax,24(%rdi)
	movq  %rax,32(%rdi)
	movq  %rax,40(%rdi)
	movq  %rax,48(%rdi)
	movq  %rax,56(%rdi)
	leaq  64(%rdi),%rdi
	jnz    .Lloop_64

	/* Handle the tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl	%edx,%ecx
	andl    $63&(~7),%ecx
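	/*
	 * $63&(~7) == 56 masks out the whole quadwords still pending below
	 * 64 bytes; e.g. a remaining count of 29 leaves 24 here (three
	 * quadword stores), with 29 & 7 = 5 tail bytes for .Lloop_1.
	 */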
	jz	.Lhandle_7
	shrl	$3,%ecx
	.p2align 4
.Lloop_8:
	decl   %ecx
	movq  %rax,(%rdi)
	leaq  8(%rdi),%rdi
	jnz    .Lloop_8

.Lhandle_7:
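	/* store the final count & 7 bytes one at a time */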
	andl	$7,%edx
	jz      .Lende
	.p2align 4
.Lloop_1:
	decl    %edx
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz     .Lloop_1

.Lende:
	movq	%r10,%rax
	RET

.Lbad_alignment:
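	/*
	 * %r9 holds dst & 7 (nonzero here). For counts of at most 7 bytes,
	 * byte stores suffice. Otherwise store one unaligned quadword at
	 * dst, then advance dst by 8 - (dst & 7) to the next 8-byte
	 * boundary; e.g. dst & 7 == 3 advances by 5 bytes, and the aligned
	 * code simply rewrites the 3 overlapping bytes.
	 */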
	cmpq $7,%rdx
	jbe	.Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
SYM_FUNC_END(memset_orig)