/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex strings work originally authored by
 * Linaro. The original code can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

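/*
 * Note: ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 used below are
 * not architectural instructions; they are presumably macros provided by
 * the file that includes this template, so the same copy loop can be
 * reused with different load/store flavours. An assumed, illustrative set
 * of definitions is sketched in the comment at the end of this file.
 */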

	mov	dst, dstin
	cmp	count, #16
	/* When the copy length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This eliminates the risk of overwriting source data when
	 * the distance between src and dst is less than 16. The memory
	 * accesses here are aligned.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores that access
	 * memory in increasing address order, rather than loading/storing 16
	 * bytes from (src-16) to (dst-16) and winding src back to an aligned
	 * address, which is the approach used in the original cortex memcpy.
	 * If the original approach were kept here, memmove would have to
	 * satisfy the precondition that the src address is at least 16 bytes
	 * greater than the dst address, otherwise some source data would be
	 * overwritten when memmove calls memcpy directly. To keep memmove
	 * simple and to decouple memcpy from memmove, the original scheme
	 * was dropped.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc
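	/*
	 * Illustrative walk-through for a 200-byte copy with an aligned src:
	 * .Lcpy_over64 leaves count = 200 - 128 = 72, so the large loop
	 * below is taken. It pre-loads 64 bytes, then each iteration stores
	 * the previously loaded 64 bytes while loading the next 64; after
	 * two iterations count = 72 - 128 = -56 and the loop exits, storing
	 * the final 64 bytes (192 bytes in total). Since -56 & 0x3f = 8,
	 * .Ltail63/.Ltiny15 then copy the remaining 8 bytes.
	 */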

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the loads of the next 64-byte block with the stores of
	 * the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
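
/*
 * For reference only: a rough, assumed sketch of how a file including this
 * template might wire it up. The macro bodies, the SYM_FUNC_* wrapper and
 * the file name below are illustrative assumptions, not definitions made by
 * this file. The includer is expected to provide one macro per ldrb1/strb1,
 * ldrh1/strh1, ldr1/str1 and ldp1/stp1 helper used above, typically as
 * post-indexed loads/stores:
 *
 *	.macro ldrb1 reg, ptr, val
 *	ldrb \reg, [\ptr], \val
 *	.endm
 *
 *	.macro ldp1 reg1, reg2, ptr, val
 *	ldp \reg1, \reg2, [\ptr], \val
 *	.endm
 *
 *	(... likewise for the remaining helpers ...)
 *
 *	SYM_FUNC_START(memcpy)
 *	#include "copy_template.S"
 *	ret
 *	SYM_FUNC_END(memcpy)
 */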