/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.pushsection .noinstr.text, "ax"

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those which set REP_GOOD). In addition, on
 * CPUs which have the enhanced REP MOVSB/STOSB feature (ERMS), those
 * NOPs are changed to a jmp to memcpy_erms, which does the whole copy
 * with REP; MOVSB.
 */

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_WEAK(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax		/* return the original destination */
	movq %rdx, %rcx
	shrq $3, %rcx		/* number of 8-byte words */
	andl $7, %edx		/* leftover byte count */
	rep movsq
	movl %edx, %ecx
	rep movsb
	RET
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. Faster and simpler than
 * the generic memcpy; selected via the ERMS alternative above when the
 * CPU supports it.
 */
SYM_FUNC_START_LOCAL(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(memcpy_erms)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail	/* fewer than 32 bytes: tail handling only */

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
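	/*
	 * Rough C sketch of the direction choice below (illustration only;
	 * the exact aliasing behaviour is hardware dependent):
	 *
	 *	if ((s8)((long)src & 0xff) < (s8)((long)dst & 0xff))
	 *		copy_backward();	// forward loads could alias recent stores
	 *	else
	 *		copy_forward();
	 *
	 * Only the low byte of each pointer is compared (cmp %dil, %sil);
	 * it is a cheap heuristic for the false-dependence case described
	 * in the comment above.
	 */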
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop	/* CF from the subq above: loop while >= 32 bytes remain */
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Point rsi/rdi at the tail; the copy proceeds backward.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop	/* as above: loop while >= 32 bytes remain */

	/*
	 * Point rsi/rdi back at the head, where the remaining bytes are.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move 16 to 31 bytes of data.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move 8 to 15 bytes of data.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move 4 to 7 bytes of data.
	 */
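	/*
	 * As in the 8..15 and 16..31 byte cases above, this relies on
	 * overlapping head/tail windows instead of a byte loop.  Roughly,
	 * in C (a sketch for illustration only, 4-byte accesses):
	 *
	 *	u32 head = *(u32 *)src;
	 *	u32 tail = *(u32 *)(src + len - 4);
	 *	*(u32 *)dst = head;
	 *	*(u32 *)(dst + len - 4) = tail;
	 *
	 * For any length in [4, 7] the two windows cover every byte,
	 * overlapping in the middle when the length is below 8.
	 */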
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend		/* count was zero: nothing to copy */
	/*
	 * Move 1 to 3 bytes of data.
	 */
	movzbl (%rsi), %ecx	/* MOVZX does not modify the flags... */
	jz .Lstore_1byte	/* ...so ZF is still from the subl: count was 1 */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9	/* last byte: %rdx == count - 1 */
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)

.popsection