/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 *
 * Register roles used in the comments below:
 *	r3  - return value (original d)
 *	r4  - n  : byte count of the current phase
 *	r5  - d  : destination pointer
 *	r6  - s  : source pointer
 *	r7  - c  : bytes still to be copied
 *	r8  - as : source pointer rounded down to a word boundary
 *	r9  - t1 : scratch / word being assembled
 *	r10 - t2 or running word offset
 *	r11 - t3 or h : bytes carried over from the previous source word
 *	r12 - t4 or v : most recently loaded source word
 *
 * Both routines work in four phases:
 *	1. byte copies until the destination is word aligned
 *	2. 32-byte blocks (eight words, unrolled)
 *	3. remaining whole words
 *	4. remaining 0~3 bytes
 * In phases 2 and 3 the source may still be misaligned by 1, 2 or 3
 * bytes; each case has its own loop that loads aligned words and merges
 * neighbouring words with a pair of shifts.
 *
 * NOTE: the shift directions used for merging (e.g. "h << 8 | v >> 24"
 * for a source offset of 1 when ascending) match a big-endian word
 * layout.
 *
 * "cmpu rD, rA, rB" computes rB - rA and forces the sign bit of rD to the
 * result of the unsigned comparison rA > rB, so a following blti/bgei
 * acts as an unsigned branch.
 *
 * Branches ending in "d" (brid, bneid, rtsd) execute the following
 * instruction in their delay slot before the branch takes effect.
 */

#include <linux/linkage.h>
	.text
	.globl	memcpy
	.type  memcpy, @function
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size  memcpy, . - memcpy
.end memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

/*
 * memmove: same register interface as memcpy.  When the source is at or
 * above the destination an ascending copy cannot overwrite source bytes
 * that are still to be read, so the memcpy body is reused; otherwise the
 * copy runs from the end of both buffers towards the start.
 */
memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4,fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4,d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4,d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */

d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	/*
	 * Exit through memmove's own epilogue (d_done) rather than memcpy's
	 * a_done; both are the same "rtsd r15, 8; nop" sequence.
	 */
	beqi	r7, d_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size  memmove, . - memmove
.end memmove