/* SPDX-License-Identifier: GPL-2.0 */
/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
#include <linux/linkage.h>

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 *
 * Register usage (as established by the entry code below):
 *   In:   r4 = dst, r5 = src, r6 = n
 *   Out:  r0 = dst.  The zero length path loads r4 into r0 explicitly;
 *         the copy paths walk r0 down from dst + n and return once it
 *         has reached r4.
 *   Work: r0 = one past the next destination byte to be written; the
 *         copy runs from the end of the buffers towards the start and
 *         stores use pre-decrement addressing on r0.
 *         r5 = src - dst, biased by a small constant in some paths, so
 *         that @(r0,r5) addresses the source data that matches r0.
 *   r1-r3, r6, r7 are used as scratch.  r8-r11 (and r12 in .Lcase2b)
 *   are pushed on the r15 stack around the 32 byte block loops and
 *   popped again before returning.
 *
 * Layout convention: an instruction indented by one extra space sits in
 * the delay slot of the branch directly above it (bt/s, bf/s, bra, rts).
 * Plain bt and bf have no delay slot.
 *
 * The trailing "! nn XX" remarks are the scheduling annotations of the
 * original author and are kept unchanged.
 */

	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX	! r2 = dst + 7: long word loop bound
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy a byte at once, if necessary

	add	#4,r5		!  50 EX	! r5 = (src - dst) - 1 again
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX	! r2 = dst + 1: byte loop bound
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop


	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		! 79 EX
	mov	r4,r2		!  5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
	add	#-4,r5		! 50 EX

	add	#7,r2		!  79 EX	! r2 = dst + 7: long word loop bound
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	 mov.l	r3,@-r0
#endif

	! Finally, copy a byte at once, if necessary

	add	#6,r5		!  50 EX	! r5 = (src - dst) - 1 again
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX	! r2 = dst + 1: byte loop bound
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
	!	         [ ...  ]                 [ ...  ]
	!	           :                        :
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX	! r0 = len | dst | src
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT	! T = all three long word aligned

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX	! r0 = dst + len

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX	! r5 = src - dst

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines become worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT	! T = (16 > len), signed compare

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)
	shlr	r6		! 104 EX	! r6 = len / 2, T = low bit of len

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR	! even length: enter the loop at 4

	 add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR	! length was 1: store it and return
	 mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS
98:
	rts
	 nop

99:	rts
	 mov	r4, r0

	! Size is not small, so it is worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	 and	r0,r3		!  78 EX	! r3 = r0 & 3 = bytes to copy first

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS

2:	add	#1, r5		!  79 EX	! r5 = src - dst again

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.
	!
	! r0 is parked in r3 while the low two bits of r5 are tested through
	! r0 (tst with an immediate operates on r0); each branch below puts
	! it back in its delay slot.

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT	! T = (remaining len >= 64)

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	 mov	r3, r0

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		! 87 MT

	bt/s	.Lcase1
	 mov	r3, r0

	bra	.Lcase3
	 nop


	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT	! T = (64 > len)
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX	! r6 = len / 8, T = odd long count
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	 add	#-8, r3		!  50 EX

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

5:	rts
	 nop


	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX	! r7 = dst + 11: long word loop bound
	tst	r2, r6		!  86 MT	! T = even number of long words left

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX	! r7 = dst + 1: byte loop bound

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! copy initial words until cache line aligned

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX

	bt/s	4f		! 111 BR
	 add	#8, r3		!  50 EX

	tst	#0x18, r0	!  87 MT

	bt/s	1f		! 109 BR
	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r7, @-r0	!  30 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	!  18 LS (latency=2)
	add	#-0x20, r1	!  50 EX
	mov.l	@(0x04,r5),r3	!  18 LS (latency=2)
	mov.l	@(0x08,r5),r6	!  18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	!  18 LS (latency=2)
	mov.l	@(0x10,r5),r8	!  18 LS (latency=2)
	mov.l	@(0x14,r5),r9	!  18 LS (latency=2)
	mov.l	@(0x18,r5),r10	!  18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	!  18 LS (latency=2)
	movca.l	r0,@r1		!  40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	!  33 LS
	mov.l	r6,@(0x08,r1)	!  33 LS
	mov.l	r7,@(0x0c,r1)	!  33 LS

	mov.l	r8,@(0x10,r1)	!  33 LS
	add	#-0x20, r5	!  50 EX

	mov.l	r9,@(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10,@(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11,@(0x1c,r1)	!  33 LS

	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	 @r15+, r9	!  15 LS

	! The r8 restore below is shared: it is the delay slot of this rts
	! when the copy is complete, and the branch target of the bf/s above
	! when trailing bytes remain.
	rts
1:	 mov.l	@r15+, r8	!  15 LS
	sub	r4, r1		!  75 EX		(len remaining)

	! number of trailing bytes is non-zero
	!
	! invariants restored (r5 already decremented by 4)
	! also r1=num bytes remaining

	mov	#4, r2		!   6 EX
	mov	r4, r7		!   5 MT (latency=0)

	add	#0x1c, r5	!  50 EX		(back to -4)
	cmp/hs	r2, r1		!  58 MT	! T = (remaining >= 4), unsigned

	bf/s	5f		! 108 BR
	 add	#11, r7		!  50 EX

	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	tst	r2, r1		!  86 MT	! T = even number of long words left

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	! NOTE(review): r2 is still 4 and r1 >= 4 was established by the
	! cmp/hs above, so this compare always sets T and the bt/s below is
	! always taken: after the odd long word is stored, everything left is
	! copied by the byte loop at 5. The result is correct, but it looks
	! as if a test for at least 8 bytes remaining may have been intended.
	! Confirm before changing.
	cmp/hs	r2, r1		!  58 MT

	bt/s	5f		! 111 BR
	 mov.l	r6,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r6, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

5:	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX	! r7 = dst + 1: byte loop bound

	bt	9f		! 110 BR
	add	#3,r5		!  50 EX

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	!
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
	!

	.balign	32
.Lcase2:
	! Size is 16 or greater and less than 64, but may have trailing bytes

2:	mov	r5, r6		!   5 MT (latency=0)
	add	#-2,r5		!  50 EX

	mov	r4,r2		!   5 MT (latency=0)
	add	#-4,r6		!  50 EX

	add	#7,r2		!  50 EX	! r2 = dst + 7: loop bound
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)

	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
	cmp/hi	r2,r0		!  57 MT

	mov.w	r1,@-r0		!  29 LS
	bt/s	3b		! 111 BR

	 mov.w	r3,@-r0		!  29 LS

	bra	10f
	 nop


	.balign	32
.Lcase2b:
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	add	#-2, r5		!  50 EX
	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! Copy a short word one at a time until we are cache line aligned
	!   Normal values: r0, r2, r3, r4
	!   Unused: r1, r6, r7
	!   Mod: r5 (=r5-2)
	!
	add	#2, r3		!  50 EX

2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
	cmp/eq	r3,r0		!  54 MT

	bf/s	2b		! 111 BR

	 mov.w	r1,@-r0		!  29 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5 (=r5-2)
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	mov.l	r12, @-r15	!  30 LS

	! 17 cycles, 32 bytes per iteration
#ifdef CONFIG_CPU_LITTLE_ENDIAN
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
	add	#-0x20, r1	!  50 EX

	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK

	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
	shll16	r0		! 103 EX			JI..

	mov.l	@r5+, r7	!  15 LS (latency=2)
	xtrct	r3, r0		!  48 EX			LKJI

	mov.l	@r5+, r8	!  15 LS (latency=2)
	xtrct	r6, r3		!  48 EX			PONM

	mov.l	@r5+, r9	!  15 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@r5+, r10	!  15 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@r5+, r11	!  15 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.w	@r5+, r12	!  15 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	xtrct	r11, r10	!  48 EX

	mov.l	r3, @(0x04,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r6, @(0x08,r1)	!  33 LS

	mov.l	r7, @(0x0c,r1)	!  33 LS

	mov.l	r8, @(0x10,r1)	!  33 LS
	add	#-0x40, r5	!  50 EX

	mov.l	r9, @(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x1c,r1)	!  33 LS
#else
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
	add	#-2, r5		!  50 EX

	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
	add	#-4, r1		!  50 EX

	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
	shll16	r0		! 103 EX

	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
	xtrct	r3, r0		!  48 EX

	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
	xtrct	r6, r3		!  48 EX

	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.l	@(0x00,r5), r12	!  18 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	add	#-0x1c, r1	!  50 EX

	mov.l	r3, @(0x18,r1)	!  33 LS
	xtrct	r11, r10	!  48 EX

	mov.l	r6, @(0x14,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r7, @(0x10,r1)	!  33 LS

	mov.l	r8, @(0x0c,r1)	!  33 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r9, @(0x08,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x04,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x00,r1)	!  33 LS
#endif

	mov.l	@r15+, r12
	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	 @r15+, r9	!  15 LS

	! The r8 restore below is shared: it is the delay slot of this rts
	! when the copy is complete, and the branch target of the bf/s above
	! when trailing bytes remain.
	rts
1:	 mov.l	@r15+, r8	!  15 LS

	add	#0x1e, r5	!  50 EX

	! Finish off a short word at a time
	! r5 must be invariant - 2
10:	mov	r4,r2		!   5 MT (latency=0)
	add	#1,r2		!  50 EX

	cmp/hi	r2, r0		!  57 MT
	bf/s	1f		! 109 BR

	 add	#2, r2		!  50 EX

3:	mov.w	@(r0,r5),r1	!  20 LS
	cmp/hi	r2,r0		!  57 MT

	bt/s	3b		! 109 BR

	 mov.w	r1,@-r0		!  29 LS
1:

	!
	! Finally, copy the last byte if necessary
	cmp/eq	r4,r0		!  54 MT
	bt/s	9b
	 add	#1,r5
	mov.b	@(r0,r5),r1
	rts
	 mov.b	r1,@-r0