/* SPDX-License-Identifier: GPL-2.0 */
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * copy_page(void *to, void *from) - copy one 4096-byte page.
 *
 * ABI:	  SysV AMD64 (kernel). In: %rdi = to, %rsi = from.
 * Out:	  nothing. Clobbers: %rax, %rcx, %rdx, %r8-%r12, flags
 *	  (%rbx/%r12 are saved/restored on the slow path).
 *
 * Some CPUs run faster using the string copy instructions (sane microcode).
 * It is also a lot simpler. Use this when possible. But, don't use streaming
 * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
 * prefetch distance based on SMP/UP.
 */
	ALIGN
SYM_FUNC_START(copy_page)
	/* Patched at boot: CPUs with fast rep/movsq fall through,	*/
	/* everything else takes the unrolled register-copy path.	*/
	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
	movl	$4096/8, %ecx		/* page size in quadwords */
	rep	movsq
	RET
SYM_FUNC_END(copy_page)
EXPORT_SYMBOL(copy_page)

/*
 * Fallback: copy the page 64 bytes (one cache line) per iteration
 * through registers, prefetching 5 lines ahead.  The last 5 lines are
 * copied by a second loop without prefetch, so we never prefetch past
 * the end of the source page.
 */
SYM_FUNC_START_LOCAL(copy_page_regs)
	subq	$2*8, %rsp		/* spill slots for callee-saved regs */
	movq	%rbx, (%rsp)
	movq	%r12, 1*8(%rsp)

	movl	$(4096/64)-5, %ecx	/* all lines except the final 5 */
	.p2align 4
.Loop64:
	dec	%rcx			/* mov/lea/prefetch preserve flags; */
					/* jnz below still sees this result */
	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	prefetcht0 5*64(%rsi)		/* stay 5 cache lines ahead */

	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8,  0x8*3(%rdi)
	movq	%r9,  0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	leaq	64(%rsi), %rsi		/* advance one cache line */
	leaq	64(%rdi), %rdi

	jnz	.Loop64

	/* Tail: final 5 cache lines, no prefetch. */
	movl	$5, %ecx
	.p2align 4
.Loop2:
	decl	%ecx

	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8,  0x8*3(%rdi)
	movq	%r9,  0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	leaq	64(%rdi), %rdi
	leaq	64(%rsi), %rsi
	jnz	.Loop2

	movq	(%rsp), %rbx		/* restore callee-saved regs */
	movq	1*8(%rsp), %r12
	addq	$2*8, %rsp
	RET
SYM_FUNC_END(copy_page_regs)