/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro, which can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest <= src, call memcpy, otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_PI(memmove)
	cmp	dstin, src
	b.lo	__memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap.  */

	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15	/* probably unaligned accesses */

	ands	tmp2, src, #15	/* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading unaligned bytes so that src becomes aligned
	 * first. The cost of these extra instructions is acceptable, and
	 * it lets the subsequent accesses use aligned addresses.
	 */
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
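
	/*
	 * src is now 16-byte aligned and count has been reduced by the
	 * bytes copied above, so the block copies below use aligned loads.
	 */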

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
	 * to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	 * Interleave the load of the next 64-byte data block with the store
	 * of the last loaded 64-byte block.
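	 *
	 * count was already reduced by 128 at .Lcpy_over64, so the b.ge
	 * below keeps iterating while another full 64-byte block remains to
	 * be pre-loaded; the stores after the loop drain the final
	 * pre-loaded block.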
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
SYM_FUNC_END_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)