/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file was rewritten from the memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
SYM_FUNC_START_WEAK(memmove)
SYM_FUNC_START(__memmove)

	mov %rdi, %rax

	/* Decide forward/backward copy mode */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

	/* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
	ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
	ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS

	/*
	 * The movsq instruction has a high startup latency, so we
	 * handle small sizes with plain general-purpose register moves.
	 */
	cmp $680, %rdx
	jb 3f
	/*
	 * movsq is only good for the mutually aligned case; the cheap
	 * low-byte compare below checks for that.
	 */
	cmpb %dil, %sil
	je 4f
3:
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	addq $0x20, %rdx
	jmp 1f
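	/*
	 * Note on the two rep-movsq paths below: each copies count/8
	 * qwords with rep movsq and finishes with a single, possibly
	 * overlapping, qword store to cover a count that is not a
	 * multiple of eight.  The qword for that final store is loaded
	 * into %r11 before rep movsq runs, so it still holds the
	 * original source data even when the string copy has already
	 * written over that part of an overlapping source buffer.
	 */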
	/*
	 * Handle the forward copy with movsq.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle the backward copy with movsq.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Prepare for a backward copy.
	 */
	.p2align 4
2:
	cmp $0x20, %rdx
	jb 1f
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
6:
	/*
	 * Point src and dest one past the end of the region; the copy
	 * then works backward from the tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Point src and dest back at the head of the remaining bytes.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move 16 to 31 bytes of data.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
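	/*
	 * The remaining size classes below use the same trick as the
	 * 16..31 byte case above: every load is performed before any
	 * store, so an overlap between src and dest cannot corrupt
	 * the data being moved.
	 */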
	/*
	 * Move 8 to 15 bytes of data.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move 4 to 7 bytes of data.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move 2 or 3 bytes of data.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move the final byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	RET

.Lmemmove_erms:
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memmove)
SYM_FUNC_END_ALIAS(memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)