/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
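
/*
 * Purely illustrative, not part of the build (the function name is made
 * up): a simplified C model of the strategy described above.  It keeps
 * the "align the destination, then copy words when the source is also
 * word-aligned" shape, but drops the short-copy cutoffs and falls back
 * to plain bytes instead of the SRC-based shifting copy used at
 * .Lsrcunaligned.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *memcpy_model(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (len && ((uintptr_t)d & 3)) {	// cf. .Ldst1mod2/.Ldst2mod4
 *			*d++ = *s++;
 *			len--;
 *		}
 *		if (((uintptr_t)s & 3) == 0) {		// cf. .Ldstaligned/.Loop1
 *			while (len >= 4) {
 *				*(uint32_t *)d = *(const uint32_t *)s;
 *				d += 4;
 *				s += 4;
 *				len -= 4;
 *			}
 *		}
 *		while (len--)				// byte tail / unaligned source
 *			*d++ = *s++;
 *		return dst;
 *	}
 */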

	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte	# continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
	addi	a5, a5, 1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	l32i	a6, a3, 8
	s32i	a7, a5, 4
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1	# continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a3, a3, 8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	addi	a5, a5, 8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3, 0
	addi	a3, a3, 4
	s32i	a6, a5, 0
	addi	a5, a5, 4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3, 0
	addi	a3, a3, 2
	s16i	a6, a5, 0
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
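
/*
 * Roughly how the shifting copy below works (illustrative only, shown
 * for the little-endian case; "merge" is a made-up name): the source
 * pointer is first rounded down to a word boundary, __ssa8 sets the
 * shift-amount register to 8 * (src & 3), and __src_b then extracts
 * each destination word from two adjacent aligned source words, i.e.
 *
 *	uint32_t merge(uint32_t lower, uint32_t higher, unsigned off)
 *	{
 *		return (uint32_t)((((uint64_t)higher << 32) | lower) >> (8 * off));
 *	}
 *
 * so the bulk of the copy only ever issues aligned 32-bit loads and stores.
 */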

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	__src_b	a6, a6, a7
	s32i	a6, a5, 0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5, 4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5, 8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2	# continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	__src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a3, a3, 8
	__src_b	a7, a7, a8
	s32i	a7, a5, 4
	addi	a5, a5, 8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3, 4
	addi	a3, a3, 4
	__src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a5, a5, 4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	abi_ret_default

ENDPROC(__memcpy)

/*
 * void bcopy(const void *src, void *dest, size_t n);
 */

ENTRY(bcopy)

	abi_entry_default
	# a2=src, a3=dst, a4=len
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy

ENDPROC(bcopy)
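
/*
 * Illustrative note (not part of the build): bcopy only swaps its
 * operands into the memmove argument order before falling into the
 * common memmove path (.Lmovecommon below), i.e. in C terms
 *
 *	void bcopy(const void *src, void *dst, size_t n)
 *	{
 *		memmove(dst, src, n);
 *	}
 */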

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
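
/*
 * Illustrative C model of the overlap test performed at .Lmovecommon
 * below (not part of the build; the function name is made up).  A single
 * unsigned compare of (dst - src) against len covers both the "dst below
 * src" case and the fully disjoint case, so only a genuinely destructive
 * forward overlap takes the backward path:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *memmove_model(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *		size_t i;
 *
 *		if ((uintptr_t)dst - (uintptr_t)src >= len) {
 *			for (i = 0; i < len; i++)	// forward copy is safe
 *				d[i] = s[i];
 *		} else {
 *			while (len--)			// destructive overlap:
 *				d[len] = s[len];	// copy backwards
 *		}
 *		return dst;
 *	}
 */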

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte	# continue loop if
					# $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a5, a5, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3, 8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3, 4
	s32i	a6, a5, 8
	l32i	a6, a3, 0
	s32i	a7, a5, 4
	s32i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .backLoop1	# continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a5, a5, -8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3, 0
	addi	a5, a5, -4
	s32i	a6, a5, 0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3, 0
	addi	a5, a5, -2
	s16i	a6, a5, 0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
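
/*
 * Illustrative note (not part of the build): this is the mirror image of
 * the forward shifting copy at .Lsrcunaligned.  Walking backwards, the
 * word carried in a6 across iterations is the higher-addressed word of
 * each pair, so it appears as the second __src_b operand instead of the
 * first; in the C terms sketched there, each stored word is still
 * merge(lower, higher, src & 3), only the direction of travel changes.
 */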

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3, 8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3, 4
	__src_b	a7, a8, a7
	s32i	a7, a5, 8
	l32i	a6, a3, 0
	__src_b	a8, a9, a8
	s32i	a8, a5, 4
	__src_b	a9, a6, a9
	s32i	a9, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .backLoop2	# continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3, 4
	l32i	a8, a3, 0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5, 4
	__src_b	a7, a8, a7
	s32i	a7, a5, 0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3, 0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5, 0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a5, a5, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	abi_ret_default

ENDPROC(__memmove)