/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
 */

/*
 * Description
 *
 *   library function for memcpy where length bytes are copied from
 *   ptr_in to ptr_out. ptr_out is returned unchanged.
 *   Allows any combination of alignment on input and output pointers
 *   and length from 0 to 2^32-1
 *
 * Interface (see the register defines below)
 *   In:  R0 = ptr_out, R1 = ptr_in, R2 = len
 *   Out: R0 = the original ptr_out
 *
 * Restrictions
 *   The arrays should not overlap, the program will produce undefined output
 *   if they do.
 *   For blocks of less than 24 bytes a byte by byte copy is performed. For
 *   8byte alignments, and length multiples, a dword copy is performed for
 *   lengths below 96 bytes
 * History
 *
 *   DJH  5/15/09 Initial version 1.0
 *   DJH  6/ 1/09 Version 1.1 modified ABI to include R16-R19
 *   DJH  7/12/09 Version 1.2 optimized codesize down to 760 was 840
 *   DJH 10/14/09 Version 1.3 added special loop for aligned case, was
 *                            overreading bloated codesize back up to 892
 *   DJH  4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
 *                            occurring if only 1 left outstanding, fixes bug
 *                            # 3888, corrected for all alignments. Peeled off
 *                            1 32byte chunk from kernel loop and extended 8byte
 *                            loop at end to solve all combinations and prevent
 *                            over read.  Fixed Ldword_loop_prolog to prevent
 *                            overread for blocks less than 48bytes. Reduced
 *                            codesize to 752 bytes
 *   DJH  4/21/10 version 1.5 1.4 fix broke code for input block ends not
 *                            aligned to dword boundaries,underwriting by 1
 *                            byte, added detection for this and fixed. A
 *                            little bloat.
 *   DJH  4/23/10 version 1.6 corrected stack error, R20 was not being restored
 *                            always, fixed the error of R20 being modified
 *                            before it was being saved
 * Natural c model
 * ===============
 * void * memcpy(char * ptr_out, char * ptr_in, int length) {
 *   int i;
 *   if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
 *   return(ptr_out);
 * }
 *
 * Optimized memcpy function
 * =========================
 * void * memcpy(char * ptr_out, char * ptr_in, int len) {
 *   int i, prolog, kernel, epilog, epilogdws, mask;
 *   u8 offset;
 *   s64 data0, dataF8, data70;
 *
 *   s64 * ptr8_in;
 *   s64 * ptr8_out;
 *   s32 * ptr4;
 *   s16 * ptr2;
 *
 *   offset = ((int) ptr_in) & 7;
 *   ptr8_in = (s64 *) &ptr_in[-offset];   //read in the aligned pointers
 *
 *   data70 = *ptr8_in++;
 *   dataF8 = *ptr8_in++;
 *
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   prolog = 32 - ((int) ptr_out);
 *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len);
 *   prolog = prolog & mask;
 *   kernel = len - prolog;
 *   epilog = kernel & 0x1F;
 *   kernel = kernel>>5;
 *
 *   if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (prolog & 2) { ptr2[0] = (u16) data0;  data0 >>= 16; ptr_out += 2;}
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (prolog & 4) { ptr4[0] = (u32) data0;  data0 >>= 32; ptr_out += 4;}
 *
 *   offset = offset + (prolog & 7);
 *   if (offset >= 8) {
 *     data70 = dataF8;
 *     dataF8 = *ptr8_in++;
 *   }
 *   offset = offset & 0x7;
 *
 *   prolog = prolog >> 3;
 *   if (prolog) for (i=0; i < prolog; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   if(kernel) { kernel -= 1; epilog += 32; }
 *   if(kernel) for(i=0; i < kernel; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   epilogdws = epilog >> 3;
 *   if (epilogdws) for (i=0; i < epilogdws; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
 *   if (epilog & 1) { *ptr_out++ = (u8) data0; }
 *
 *   return(ptr_out - len);
 * }
 *
 * Codesize : 784 bytes
 */

/*
 * Register names.
 *
 * Note: several names alias the same physical register (for example
 * mask/shift/rest are all R8, offset/len8 are both R9, prolog/epilogdws are
 * both R15 which is also the high word of inc, and most predicate names are
 * p0), so aliased names cannot hold live values at the same time.
 */
#define ptr_out		R0	/*  destination pointer  */
#define ptr_in		R1	/*  source pointer  */
#define len		R2	/*  length of copy in bytes  */

#define data70		R13:12	/*  lo 8 bytes of non-aligned transfer  */
#define dataF8		R11:10	/*  hi 8 bytes of non-aligned transfer  */
#define ldata0		R7:6	/*  even 8 bytes chunks  */
#define ldata1		R25:24	/*  odd 8 bytes chunks  */
#define data1		R7	/*  high word of ldata0 (R7:6); not used below  */
#define data0		R6	/*  low word (4 bytes) of ldata0  */

#define ifbyte		p0	/*  if transfer has bytes in epilog/prolog  */
#define ifhword		p0	/*  if transfer has shorts in epilog/prolog  */
#define ifword		p0	/*  if transfer has words in epilog/prolog  */
#define noprolog	p0	/*  no prolog, xfer starts at 32byte  */
#define nokernel	p1	/*  no 32byte multiple block in the transfer  */
#define noepilog	p0	/*  no epilog, xfer ends on 32byte boundary  */
#define align		p2	/*  alignment of input rel to 8byte boundary  */
#define kernel1		p0	/*  kernel count == 1  */

#define dalign		R25	/*  rel alignment of input to output data  */
#define star3		R16	/*  number bytes in prolog - dwords  */
#define rest		R8	/*  length - prolog bytes  */
#define back		R7	/*  nr bytes > dword boundary in src block  */
#define epilog		R3	/*  bytes in epilog  */
#define inc		R15:14	/*  adds -1 to kernel, 32 to prefetch ptr  */
#define kernel		R4	/*  number of 32byte chunks in kernel  */
#define ptr_in_p_128	R5	/*  pointer for prefetch of input data  */
#define mask		R8	/*  mask used to determine prolog size  */
#define shift		R8	/*  used to work a shifter to extract bytes  */
#define shift2		R5	/*  in epilog to workshifter to extract bytes  */
#define prolog		R15	/*  bytes in prolog  */
#define epilogdws	R15	/*  number dwords in epilog  */
#define shiftb		R14	/*  used to extract bytes  */
#define offset		R9	/*  same as align in reg  */
#define ptr_out_p_32	R17	/*  pointer to output dczero  */
#define align888	R14	/*  if simple dword loop can be used  */
#define len8		R9	/*  number of dwords in length  */
#define over		R20	/*  nr of bytes > last inp buf dword boundary  */

#define ptr_in_p_128kernel	R5:4	/*  packed fetch pointer & kernel cnt  */

	.section .text
	.p2align 4
	.global memcpy
	.type memcpy, @function
/*
 * Dispatch performed by the first packets:
 *   - len == 0 or ptr_in == ptr_out : return immediately
 *   - (ptr_in | ptr_out | len) has its low 3 bits clear and len <= 95 :
 *     .Ldwordaligned
 *   - len <= 23 : .Lbytes23orless
 *   - otherwise : general prolog / 32-byte kernel / epilog path, which
 *     allocates a frame and exits through .Lmemcpy_return
 */
memcpy:
{
	p2 = cmp.eq(len, #0);		/*  =0  */
	align888 = or(ptr_in, ptr_out);	/*  %8 < 97  */
	p0 = cmp.gtu(len, #23);		/*  %1, <24  */
	p1 = cmp.eq(ptr_in, ptr_out);	/*  attempt to overwrite self  */
}
{
	p1 = or(p2, p1);
	p3 = cmp.gtu(len, #95);		/*  %8 < 97  */
	align888 = or(align888, len);	/*  %8 < 97  */
	len8 = lsr(len, #3);		/*  %8 < 97  */
}
{
	dcfetch(ptr_in);		/*  zero/ptrin=ptrout causes fetch  */
	p2 = bitsclr(align888, #7);	/*  %8 < 97  */
	if(p1) jumpr r31;		/*  =0  */
}
{
	p2 = and(p2,!p3);			/*  %8 < 97  */
	if (p2.new) len = add(len, #-8);	/*  %8 < 97  */
	if (p2.new) jump:NT .Ldwordaligned;	/*  %8 < 97  */
}
{
	if(!p0) jump .Lbytes23orless;	/*  %1, <24  */
	mask.l = #LO(0x7fffffff);
	/*  all bytes before line multiples of data  */
	prolog = sub(#0, ptr_out);
}
{
	/*
	 * save r31:30 (LR:FP) and reserve a 24-byte frame; the frame holds
	 * r17:16 at sp+0, r25:24 at sp+8 and r21:20 at sp+16 (stored below)
	 */
	allocframe(#24);
	mask.h = #HI(0x7fffffff);
	ptr_in_p_128 = add(ptr_in, #32);
	back = cl0(len);
}
{
	memd(sp+#0) = R17:16;		/*  save r16,r17 on stack  */
	/*
	 * r31 is redirected to .Lmemcpy_return so that every "jumpr r31" on
	 * the general path runs the register restore before really returning
	 */
	r31.l = #LO(.Lmemcpy_return);	/*  set up final return pointer  */
	prolog &= lsr(mask, back);
	offset = and(ptr_in, #7);
}
{
	memd(sp+#8) = R25:24;		/*  save r25,r24 on stack  */
	dalign = sub(ptr_out, ptr_in);
	r31.h = #HI(.Lmemcpy_return);	/*  set up final return pointer  */
}
{
	/*  see if the end of the input buffer is aligned  */
	over = add(len, ptr_in);
	back = add(len, offset);
	memd(sp+#16) = R21:20;		/*  save r20,r21 on stack  */
}
{
	noprolog = bitsclr(prolog, #7);
	prolog = and(prolog, #31);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	kernel = sub(len, prolog);
	shift = asl(prolog, #3);
	star3 = and(prolog, #7);
	ptr_in = and(ptr_in, #-8);
}
{
	prolog = lsr(prolog, #3);
	epilog = and(kernel, #31);
	ptr_out_p_32 = add(ptr_out, prolog);
	over = and(over, #7);
}
{
	p3 = cmp.gtu(back, #8);
	kernel = lsr(kernel, #5);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	p1 = cmp.eq(prolog, #0);
	if(!p1.new) prolog = add(prolog, #1);
	dcfetch(ptr_in_p_128);	/*  reserve the line 64bytes on  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	nokernel = cmp.eq(kernel,#0);
	dcfetch(ptr_in_p_128);	/*  reserve the line 64bytes on  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	shiftb = and(shift, #8);
}
{
	dcfetch(ptr_in_p_128);	/*  reserve the line 64bytes on  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	if(nokernel) jump .Lskip64;	/*  skip over if kernel == 0  */
	p2 = cmp.eq(kernel, #1);
}
{
	dczeroa(ptr_out_p_32);
	/*  don't advance pointer (when kernel == 1)  */
	if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dalign = and(dalign, #31);
	dczeroa(ptr_out_p_32);
}
.Lskip64:
{
	data70 = memd(ptr_in++#16);
	if(p3) dataF8 = memd(ptr_in+#8);
	if(noprolog) jump .Lnoprolog32;
	align = offset;
}
/*  up to initial 7 bytes  */
{
	ldata0 = valignb(dataF8, data70, align);
	ifbyte = tstbit(shift,#3);
	offset = add(offset, star3);
}
{
	if(ifbyte) memb(ptr_out++#1) = data0;
	ldata0 = lsr(ldata0, shiftb);
	shiftb = and(shift, #16);
	ifhword = tstbit(shift,#4);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifword = tstbit(shift,#5);
	p2 = cmp.gtu(offset, #7);
}
{
	if(ifword) memw(ptr_out++#4) = data0;
	if(p2) data70 = dataF8;
	if(p2) dataF8 = memd(ptr_in++#8);	/*  another 8 bytes  */
	align = offset;
}
.Lnoprolog32:
{
	p3 = sp1loop0(.Ldword_loop_prolog, prolog)
	rest = sub(len, star3);	/*  what's left after the loop  */
	p0 = cmp.gt(over, #0);
}
	if(p0) rest = add(rest, #16);
.Ldword_loop_prolog:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p0 = cmp.gt(rest, #16);
}
{
	data70 = dataF8;
	if(p0) dataF8 = memd(ptr_in++#8);
	rest = add(rest, #-8);
}:endloop0
.Lkernel:
{
	/*  kernel is at least 32bytes  */
	p3 = cmp.gtu(kernel, #0);
	/*  last itn. remove edge effects  */
	if(p3.new) kernel = add(kernel, #-1);
	/*  dealt with in last dword loop  */
	if(p3.new) epilog = add(epilog, #32);
}
{
	nokernel = cmp.eq(kernel, #0);		/*  after adjustment, recheck  */
	if(nokernel.new) jump:NT .Lepilog;	/*  likely not taken  */
	/*  high word +32 (prefetch pointer), low word -1 (kernel count)  */
	inc = combine(#32, #-1);
	p3 = cmp.gtu(dalign, #24);
}
/*
 * NOTE(review): dalign > 24 branches to .Lodd_alignment, which runs
 * .Loword_loop_00to24, while dalign <= 24 falls through into
 * .Loword_loop_25to31; the numeric suffixes of the two loop labels look
 * swapped relative to this test. Only the names are affected.
 */
{
	if(p3) jump .Lodd_alignment;
}
{
	loop0(.Loword_loop_25to31, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = kernel;
}
	.falign
.Loword_loop_25to31:
{
	dcfetch(ptr_in_p_128);	/*  prefetch 4 lines ahead  */
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/*  reserve the next 32bytes in cache  */
	p3 = cmp.eq(kernel, rest);
}
{
	/*  kernel -= 1  */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	/*  kill write on first iteration  */
	if(!p3) memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
{
	memd(ptr_out++#8) = ldata1;
	jump .Lepilog;
}
.Lodd_alignment:
{
	loop0(.Loword_loop_00to24, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = add(kernel, #-1);
}
	.falign
.Loword_loop_00to24:
{
	dcfetch(ptr_in_p_128);	/*  prefetch 4 lines ahead  */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/*  reserve the next 32bytes in cache  */
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
.Lepilog:
{
	noepilog = cmp.eq(epilog,#0);
	epilogdws = lsr(epilog, #3);
	kernel = and(epilog, #7);
}
{
	if(noepilog) jumpr r31;
	if(noepilog) ptr_out = sub(ptr_out, len);
	p3 = cmp.eq(epilogdws, #0);
	shift2 = asl(epilog, #3);
}
{
	shiftb = and(shift2, #32);
	ifword = tstbit(epilog,#2);
	if(p3) jump .Lepilog60;
	if(!p3) epilog = add(epilog, #-16);
}
{
	loop0(.Ldword_loop_epilog, epilogdws);
	/*  stop criteria is lsbs unless = 0 then it's 8  */
	p3 = cmp.eq(kernel, #0);
	if(p3.new) kernel= #8;
	p1 = cmp.gt(over, #0);
}
	/*  if not aligned to end of buffer execute 1 more iteration  */
	if(p1) kernel= #0;
.Ldword_loop_epilog:
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p3 = cmp.gt(epilog, kernel);
}
{
	data70 = dataF8;
	if(p3) dataF8 = memd(ptr_in++#8);
	epilog = add(epilog, #-8);
}:endloop0
/* copy last 7 bytes */
.Lepilog60:
{
	if(ifword) memw(ptr_out++#4) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifhword = tstbit(epilog,#1);
	shiftb = and(shift2, #16);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifbyte = tstbit(epilog,#0);
	/*  the final byte store below does not post-increment ptr_out  */
	if(ifbyte.new) len = add(len, #-1);
}
{
	if(ifbyte) memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);	/*  return dest pointer  */
	jumpr r31;
}
/*  do byte copy for small n  */
.Lbytes23orless:
{
	/*
	 * software-pipelined: each iteration loads a byte and stores the one
	 * loaded by the previous iteration (p3 gates the store); the last
	 * byte is stored after the loop without advancing ptr_out, hence
	 * len - 1 is subtracted to recover the original destination
	 */
	p3 = sp1loop0(.Lbyte_copy, len);
	len = add(len, #-1);
}
.Lbyte_copy:
{
	data0 = memb(ptr_in++#1);
	if(p3) memb(ptr_out++#1) = data0;
}:endloop0
{
	memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);
	jumpr r31;
}
/*  do dword copies for aligned in, out and length  */
.Ldwordaligned:
{
	/*
	 * len was already reduced by 8 on the way here; len8 still holds the
	 * original dword count, and the last dword is stored after the loop
	 */
	p3 = sp1loop0(.Ldword_copy, len8);
}
.Ldword_copy:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = memd(ptr_in++#8);
}:endloop0
{
	memd(ptr_out) = ldata0;
	ptr_out = sub(ptr_out, len);
	jumpr r31;	/*  return to function caller  */
}
/*  exit of the general path: undo the saves made after allocframe  */
.Lmemcpy_return:
	r21:20 = memd(sp+#16);	/*  restore r20+r21  */
{
	r25:24 = memd(sp+#8);	/*  restore r24+r25  */
	r17:16 = memd(sp+#0);	/*  restore r16+r17  */
}
	deallocframe;	/*  restore r31:30 (LR:FP) and release the frame  */
	jumpr r31