/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string operations to get better performance than the original function. The
 * code is simpler and shorter than the original function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB feature. It is
	 * recommended to use it when possible. If enhanced REP STOSB is not
	 * available, use the fast string instructions (REP STOSQ) below.
	 *
	 * Otherwise, fall back to the original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq	%rdi,%r9
	movq	%rdx,%rcx
	andl	$7,%edx
	shrq	$3,%rcx
	/* expand byte value */
	movzbl	%sil,%esi
	movabs	$0x0101010101010101,%rax
	imulq	%rsi,%rax
	rep stosq
	movl	%edx,%ecx
	rep stosb
	movq	%r9,%rax
	RET
SYM_FUNC_END(__memset)
SYM_FUNC_END_ALIAS(memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)
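
/*
 * A worked illustration of the byte expansion and count split used above
 * (example values only, not part of the generated code):
 *
 *   value 0xAB:	0xAB * 0x0101010101010101 = 0xABABABABABABABAB,
 *			i.e. the multiply replicates the fill byte into all
 *			eight lanes of %rax without carries, so each qword
 *			store writes eight copies of it.
 *
 *   count 70:		%rcx = 70 >> 3 = 8 qwords stored by rep stosq,
 *			%edx & 7 = 6 trailing bytes stored by rep stosb.
 */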

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
SYM_FUNC_START_LOCAL(memset_erms)
	movq	%rdi,%r9
	movb	%sil,%al
	movq	%rdx,%rcx
	rep stosb
	movq	%r9,%rax
	RET
SYM_FUNC_END(memset_erms)

SYM_FUNC_START_LOCAL(memset_orig)
	movq	%rdi,%r10

	/* expand byte value */
	movzbl	%sil,%ecx
	movabs	$0x0101010101010101,%rax
	imulq	%rcx,%rax

	/* align dst */
	movl	%edi,%r9d
	andl	$7,%r9d
	jnz	.Lbad_alignment
.Lafter_bad_alignment:

	movq	%rdx,%rcx
	shrq	$6,%rcx
	jz	.Lhandle_tail

	.p2align 4
.Lloop_64:
	decq	%rcx
	movq	%rax,(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)
	movq	%rax,24(%rdi)
	movq	%rax,32(%rdi)
	movq	%rax,40(%rdi)
	movq	%rax,48(%rdi)
	movq	%rax,56(%rdi)
	leaq	64(%rdi),%rdi
	jnz	.Lloop_64

	/*
	 * Handle the tail in loops. The loops should be faster than
	 * hard-to-predict jump tables.
	 */
	.p2align 4
.Lhandle_tail:
	movl	%edx,%ecx
	andl	$63&(~7),%ecx
	jz	.Lhandle_7
	shrl	$3,%ecx
	.p2align 4
.Lloop_8:
	decl	%ecx
	movq	%rax,(%rdi)
	leaq	8(%rdi),%rdi
	jnz	.Lloop_8

.Lhandle_7:
	andl	$7,%edx
	jz	.Lende
	.p2align 4
.Lloop_1:
	decl	%edx
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz	.Lloop_1

.Lende:
	movq	%r10,%rax
	RET

.Lbad_alignment:
	cmpq	$7,%rdx
	jbe	.Lhandle_7
	movq	%rax,(%rdi)	/* unaligned store */
	movq	$8,%r8
	subq	%r9,%r8
	addq	%r8,%rdi
	subq	%r8,%rdx
	jmp	.Lafter_bad_alignment
.Lfinal:
SYM_FUNC_END(memset_orig)
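
/*
 * A worked illustration of the .Lbad_alignment path in memset_orig
 * (example address only): for a destination address ending in ...05,
 * %r9 = %edi & 7 = 5. The cmpq $7,%rdx guard ensures at least eight
 * bytes remain, so a single unaligned qword store safely fills the
 * first eight bytes. Then %r8 = 8 - 5 = 3 advances %rdi to the next
 * 8-byte boundary and reduces the count by 3; the aligned loops simply
 * rewrite the five overlapping bytes with the same fill pattern.
 */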