/* SPDX-License-Identifier: GPL-2.0 */
/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
 *
 * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
 * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
 */

/* Syntax: GNU as for SPARC V9, run through the C preprocessor.
 *
 * GLOBAL_SPARE names the extra global register used as a scratch
 * counter/offset by the copy code.  Outside the kernel the VIS
 * entry/exit helpers and the constants they need are defined here;
 * inside the kernel they come from the included headers.
 */
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#include <asm/export.h>
#define GLOBAL_SPARE	g7
#else
#define GLOBAL_SPARE	g5
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#endif

/* Memory-access wrappers.  EX_*(x, y) wraps instruction 'x' and names
 * a fixup routine 'y'.  The defaults below emit 'x' alone and drop 'y';
 * the #ifndef guards let an including file supply its own versions.
 * The _FP variants are used on the paths that run between VISEntry and
 * VISExit.
 */
#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

/* LOAD/STORE build a plain load or store of the given mnemonic;
 * LOAD_BLK/STORE_BLK build the ASI_BLK_P block transfers.
 */
#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

/* Name of the routine being assembled. */
#ifndef FUNC_NAME
#define FUNC_NAME	memcpy
#endif

/* Optional instructions inserted near the top of the routine. */
#ifndef PREAMBLE
#define PREAMBLE
#endif

/* Condition-code set tested by the length/alignment branches. */
#ifndef XCC
#define XCC xcc
#endif

/* Realign one block: run faligndata over the eight consecutive
 * register pairs (f1,f2) ... (f8,f9) and leave the eight results in
 * %f48..%f62.  Exactly eight instructions.
 */
#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)	\
	faligndata		%f1, %f2, %f48;		\
	faligndata		%f2, %f3, %f50;		\
	faligndata		%f3, %f4, %f52;		\
	faligndata		%f4, %f5, %f54;		\
	faligndata		%f5, %f6, %f56;		\
	faligndata		%f6, %f7, %f58;		\
	faligndata		%f7, %f8, %f60;		\
	faligndata		%f8, %f9, %f62;

/* One step of the main loop: block-load [src] into fdest, block-store
 * fsrc to [dest], advance src by 0x40, take 0x40 off GLOBAL_SPARE and
 * branch to jmptgt once it reaches zero.  dest is advanced in the
 * (non-annulled) delay slot, so on both paths.  Exactly six
 * instructions; the unrolled loop instances rely on that size.
 * The last line deliberately has no trailing backslash so the macro
 * cannot swallow whatever line follows it.
 */
#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, jmptgt)	\
	EX_LD_FP(LOAD_BLK(%src, %fdest), U1_gs_80_fp);	\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
	add			%src, 0x40, %src;	\
	subcc			%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE;	\
	be,pn			%xcc, jmptgt;		\
	 add			%dest, 0x40, %dest;

/* The loop steps differ only in which register bank is refilled. */
#define LOOP_CHUNK1(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f0,  f48, branch_dest)
#define LOOP_CHUNK2(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f16, f48, branch_dest)
#define LOOP_CHUNK3(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f32, f48, branch_dest)

/* DO_SYNC is an object-like macro, presumably so that the '#' of
 * '#Sync' never appears directly inside a function-like macro body.
 */
#define DO_SYNC			membar	#Sync;

/* Loop exit, first block: block-store fsrc, advance dest, then sync.
 * Exactly three instructions.
 */
#define STORE_SYNC(dest, fsrc)				\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
	add			%dest, 0x40, %dest;	\
	DO_SYNC

/* Loop exit, second block: block-store fsrc, advance dest and jump to
 * the tail entry 'target'.  Exactly four instructions.
 */
#define STORE_JUMP(dest, fsrc, target)			\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_40_fp);	\
	add			%dest, 0x40, %dest;	\
	ba,pt			%xcc, target;		\
	 nop;

/* Tail step: take 8 off %g3 and leave for 95f if it went negative;
 * otherwise store the realigned pair (f0,f1) and advance dest.  The
 * faligndata sits in the delay slot and therefore always executes.
 */
#define FINISH_VISCHUNK(dest, f0, f1)			\
	subcc			%g3, 8, %g3;		\
	bl,pn			%xcc, 95f;		\
	 faligndata		%f0, %f1, %f48;		\
	EX_ST_FP(STORE(std, %f48, %dest), U1_g3_8_fp);	\
	add			%dest, 8, %dest;

/* Last register of a bank: take 8 off %g3 (leaving for 95f if it went
 * negative) and copy f0 into f1 in the delay slot.  Nothing is stored
 * here, and 'dest' is not used.
 */
#define UNEVEN_VISCHUNK_LAST(dest, f0, f1)	\
	subcc			%g3, 8, %g3;	\
	bl,pn			%xcc, 95f;	\
	 fsrc2			%f0, %f1;

/* As above, then branch to the 93f loop. */
#define UNEVEN_VISCHUNK(dest, f0, f1)		\
	UNEVEN_VISCHUNK_LAST(dest, f0, f1)	\
	ba,a,pt			%xcc, 93f;

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text

/* Fixup routines named as the second argument of the EX_* wrappers.
 * They are emitted only by the translation unit that has not already
 * defined EX_RETVAL.  Each one returns to the caller with %o0 set to a
 * sum of the residual counters live at the wrapped instruction
 * (presumably the number of bytes left uncopied -- confirm against the
 * files that override EX_LD/EX_ST).  The name encodes the formula:
 * U1_<reg>_<n>[_fp] returns <reg> + n + %o2, with "gs" standing for
 * GLOBAL_SPARE; the gs_*_fp variants also add %g3, and g1_1_fp also
 * adds %g2.  The _fp variants start with VISExitHalf.
 */
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
ENTRY(U1_g1_1_fp)
	VISExitHalf
	add		%g1, 1, %g1
	add		%g1, %g2, %g1
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_1_fp)
ENTRY(U1_g2_0_fp)
	VISExitHalf
	retl
	 add		%g2, %o2, %o0
ENDPROC(U1_g2_0_fp)
ENTRY(U1_g2_8_fp)
	VISExitHalf
	add		%g2, 8, %g2
	retl
	 add		%g2, %o2, %o0
ENDPROC(U1_g2_8_fp)
ENTRY(U1_gs_0_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_0_fp)
ENTRY(U1_gs_80_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_80_fp)
ENTRY(U1_gs_40_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_40_fp)
ENTRY(U1_g3_0_fp)
	VISExitHalf
	retl
	 add		%g3, %o2, %o0
ENDPROC(U1_g3_0_fp)
ENTRY(U1_g3_8_fp)
	VISExitHalf
	add		%g3, 8, %g3
	retl
	 add		%g3, %o2, %o0
ENDPROC(U1_g3_8_fp)
ENTRY(U1_o2_0_fp)
	VISExitHalf
	retl
	 mov		%o2, %o0
ENDPROC(U1_o2_0_fp)
ENTRY(U1_o2_1_fp)
	VISExitHalf
	retl
	 add		%o2, 1, %o0
ENDPROC(U1_o2_1_fp)

/* NOTE(review): the three gs stubs below are referenced only from the
 * integer 72: loop (via EX_LD/EX_ST, not the _FP wrappers), which is
 * reached without going through VISEntry and uses %o5 as a data
 * register.  They nevertheless run VISExitHalf.  Confirm against the
 * VISExitHalf definition that this is intended.
 */
ENTRY(U1_gs_0)
	VISExitHalf
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_0)
ENTRY(U1_gs_8)
	VISExitHalf
	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, 0x8, %o0
ENDPROC(U1_gs_8)
ENTRY(U1_gs_10)
	VISExitHalf
	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, 0x10, %o0
ENDPROC(U1_gs_10)
ENTRY(U1_o2_0)
	retl
	 mov		%o2, %o0
ENDPROC(U1_o2_0)
ENTRY(U1_o2_8)
	retl
	 add		%o2, 8, %o0
ENDPROC(U1_o2_8)
ENTRY(U1_o2_4)
	retl
	 add		%o2, 4, %o0
ENDPROC(U1_o2_4)
ENTRY(U1_o2_1)
	retl
	 add		%o2, 1, %o0
ENDPROC(U1_o2_1)
ENTRY(U1_g1_0)
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_0)
ENTRY(U1_g1_1)
	add		%g1, 1, %g1
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_1)

/* The _o2_adj variants first reduce %o2 to its low three bits, because
 * the loop that uses them has not yet masked %o2 itself.
 */
ENTRY(U1_gs_0_o2_adj)
	and		%o2, 7, %o2
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_0_o2_adj)
ENTRY(U1_gs_8_o2_adj)
	and		%o2, 7, %o2
	add		%GLOBAL_SPARE, 8, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_8_o2_adj)
#endif

	.align		64
/*
 * FUNC_NAME (memcpy by default)
 *
 * In:    %o0 = dst, %o1 = src, %o2 = len
 * Out:   %o0 = EX_RETVAL(original dst); the original dst is kept in %o4
 * Uses:  %o3, %o4, %o5, %g1, %g2, %g3, GLOBAL_SPARE, condition codes,
 *        and on the large-copy path the FP registers (between
 *        VISEntry and VISExit)
 *
 * Path selection by length:
 *   len == 0            -> 85: return
 *   0 < len < 16        -> 80: word loop if (dst | src | len) is a
 *                          multiple of 4, otherwise byte loop at 90:
 *   16 <= len < 5 * 64  -> 70: integer copy
 *   len >= 5 * 64       -> VIS block copy, 64 bytes per step
 *
 * Every load/store is wrapped in EX_LD/EX_ST (or the _FP forms) whose
 * second argument names the fixup routine matching the residual
 * counters live at that instruction.
 */

	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
FUNC_NAME:		/* %o0=dst, %o1=src, %o2=len */
	/* Trap (software trap 5) if any bit of len at or above
	 * bit 31 is set.
	 */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	/* The andcc in the delay slot sets the flags that 70: tests. */
	cmp		%o2, (5 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  */
	VISEntry

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 *
	 * GLOBAL_SPARE = dst - src, so [%o1 + GLOBAL_SPARE] addresses
	 * the destination while only %o1 is advanced.  %g1 gets the
	 * byte part (0..7) and %g2 the 8-byte part of the count.
	 */
	 sub		%o0, %o1, %GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

	/* Byte loop: %g1 bytes. */
1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U1_g1_1_fp)
	EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE), U1_g1_1_fp)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, %GLOBAL_SPARE, %o0

	/* 8-byte loop: %g2 bytes, realigned with faligndata.  %g1 keeps
	 * the low three bits of src; alignaddr rounds %o1 down.
	 */
2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U1_g2_0_fp)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U1_g2_0_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U1_g2_0_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

	/* Destination is 64-byte aligned.  */
3:
	membar		  #LoadStore | #StoreStore | #StoreLoad

	/* Split the remaining length three ways; afterwards
	 * GLOBAL_SPARE + %g3 + %o2 equals the length still to copy:
	 *   GLOBAL_SPARE = (len - 0x40) & ~0x3f   block-store bytes
	 *   %g3          = ((len - GLOBAL_SPARE) & ~7) - 8
	 *                                         8-byte tail stores
	 *   %o2          = the rest               byte tail
	 * %g2 = (src >> 3) & 7 selects the loop instance, %o1 is rounded
	 * down to a 64-byte boundary, and %g1 becomes the source address
	 * at which the byte tail starts.
	 */
	subcc		%o2, 0x40, %GLOBAL_SPARE
	add		%o1, %g1, %g1
	andncc		%GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
	srl		%g1, 3, %g2
	sub		%o2, %GLOBAL_SPARE, %g3
	andn		%o1, (0x40 - 1), %o1
	and		%g2, 7, %g2
	andncc		%g3, 0x7, %g3
	fsrc2		%f0, %f2
	sub		%g3, 0x8, %g3
	sub		%o2, %GLOBAL_SPARE, %o2

	add		%g1, %GLOBAL_SPARE, %g1
	subcc		%o2, %g3, %o2

	/* Prime all three register banks.  GLOBAL_SPARE drops by 0x80
	 * before the third load; the loop's fixups add that back.
	 */
	EX_LD_FP(LOAD_BLK(%o1, %f0), U1_gs_0_fp)
	add		%o1, 0x40, %o1
	add		%g1, %g3, %g1
	EX_LD_FP(LOAD_BLK(%o1, %f16), U1_gs_0_fp)
	add		%o1, 0x40, %o1
	sub		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	EX_LD_FP(LOAD_BLK(%o1, %f32), U1_gs_80_fp)
	add		%o1, 0x40, %o1

	/* There are 8 instances of the unrolled loop,
	 * one for each possible alignment of the
	 * source buffer.  Each loop instance is 452
	 * bytes.
	 *
	 * %g2 = index * 452, computed as ((i * 8 - i) * 16 + i) * 4.
	 * Do not add or remove instructions inside an instance, or in
	 * the macros it expands, without updating this arithmetic.
	 */
	sll		%g2, 3, %o3
	sub		%o3, %g2, %o3
	sllx		%o3, 4, %o3
	add		%o3, %g2, %o3
	sllx		%o3, 2, %g2
1:	rd		%pc, %o3
	add		%o3, %lo(1f - 1b), %o3
	jmpl		%o3 + %g2, %g0
	 nop

	/* Layout of every instance: three realign/load/store steps, then
	 * a branch back to "1b+4" whose delay slot performs the first
	 * faligndata of the loop head, followed by the exit sequences
	 * 1:/2:/3: which flush the two blocks still held in registers and
	 * jump into the tail chain (40: .. 63:).
	 */
	.align		64
1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f0, %f2, %f48
1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_SYNC(o0, f48)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_JUMP(o0, f48, 40f)
2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_SYNC(o0, f48)
	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_JUMP(o0, f48, 48f)
3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_SYNC(o0, f48)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_JUMP(o0, f48, 56f)

1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f2, %f4, %f48
1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_SYNC(o0, f48)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_JUMP(o0, f48, 41f)
2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_SYNC(o0, f48)
	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_JUMP(o0, f48, 49f)
3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_SYNC(o0, f48)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_JUMP(o0, f48, 57f)

1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f4, %f6, %f48
1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_SYNC(o0, f48)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_JUMP(o0, f48, 42f)
2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_SYNC(o0, f48)
	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_JUMP(o0, f48, 50f)
3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_SYNC(o0, f48)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_JUMP(o0, f48, 58f)

1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f6, %f8, %f48
1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_SYNC(o0, f48)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_JUMP(o0, f48, 43f)
2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_SYNC(o0, f48)
	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_JUMP(o0, f48, 51f)
3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_SYNC(o0, f48)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_JUMP(o0, f48, 59f)

1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f8, %f10, %f48
1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_SYNC(o0, f48)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_JUMP(o0, f48, 44f)
2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_SYNC(o0, f48)
	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_JUMP(o0, f48, 52f)
3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_SYNC(o0, f48)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_JUMP(o0, f48, 60f)

1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f10, %f12, %f48
1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_SYNC(o0, f48)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_JUMP(o0, f48, 45f)
2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_SYNC(o0, f48)
	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_JUMP(o0, f48, 53f)
3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_SYNC(o0, f48)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_JUMP(o0, f48, 61f)

1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f12, %f14, %f48
1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_SYNC(o0, f48)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_JUMP(o0, f48, 46f)
2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_SYNC(o0, f48)
	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_JUMP(o0, f48, 54f)
3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_SYNC(o0, f48)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_JUMP(o0, f48, 62f)

1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f14, %f16, %f48
1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_SYNC(o0, f48)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_JUMP(o0, f48, 47f)
2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_SYNC(o0, f48)
	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_JUMP(o0, f48, 55f)
3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_SYNC(o0, f48)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_JUMP(o0, f48, 63f)

	/* Tail chain.  Entry N drains what is left of the register bank
	 * that is still loaded, 8 bytes per step, falling through until
	 * %g3 is used up (-> 95:) or the bank's last register has been
	 * moved to %f0 (-> 93:).
	 */
40:	FINISH_VISCHUNK(o0, f0,  f2)
41:	FINISH_VISCHUNK(o0, f2,  f4)
42:	FINISH_VISCHUNK(o0, f4,  f6)
43:	FINISH_VISCHUNK(o0, f6,  f8)
44:	FINISH_VISCHUNK(o0, f8,  f10)
45:	FINISH_VISCHUNK(o0, f10, f12)
46:	FINISH_VISCHUNK(o0, f12, f14)
47:	UNEVEN_VISCHUNK(o0, f14, f0)
48:	FINISH_VISCHUNK(o0, f16, f18)
49:	FINISH_VISCHUNK(o0, f18, f20)
50:	FINISH_VISCHUNK(o0, f20, f22)
51:	FINISH_VISCHUNK(o0, f22, f24)
52:	FINISH_VISCHUNK(o0, f24, f26)
53:	FINISH_VISCHUNK(o0, f26, f28)
54:	FINISH_VISCHUNK(o0, f28, f30)
55:	UNEVEN_VISCHUNK(o0, f30, f0)
56:	FINISH_VISCHUNK(o0, f32, f34)
57:	FINISH_VISCHUNK(o0, f34, f36)
58:	FINISH_VISCHUNK(o0, f36, f38)
59:	FINISH_VISCHUNK(o0, f38, f40)
60:	FINISH_VISCHUNK(o0, f40, f42)
61:	FINISH_VISCHUNK(o0, f42, f44)
62:	FINISH_VISCHUNK(o0, f44, f46)
63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0)

	/* 8-byte tail loop.  UNEVEN_VISCHUNK* has already taken 8 off
	 * %g3 without storing anything, so on arrival, and at the top of
	 * each half of the loop, %g3 + 8 bytes are still owed to 8-byte
	 * stores.  %g3 is therefore only decremented after each store,
	 * which keeps U1_g3_8_fp (%g3 + 8 + %o2) exact for both the load
	 * and the store.  std, add and faligndata leave the condition
	 * codes alone, so the branches still test the subcc result.
	 */
93:	EX_LD_FP(LOAD(ldd, %o1, %f2), U1_g3_8_fp)
	add		%o1, 8, %o1
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
	subcc		%g3, 8, %g3
	bl,pn		%xcc, 95f
	 add		%o0, 8, %o0
	EX_LD_FP(LOAD(ldd, %o1, %f0), U1_g3_8_fp)
	add		%o1, 8, %o1
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
	subcc		%g3, 8, %g3
	bge,pt		%xcc, 93b
	 add		%o0, 8, %o0

	/* Byte tail: %o2 bytes starting at the source address that was
	 * precomputed in %g1.
	 */
95:	brz,pt		%o2, 2f
	 mov		%g1, %o1

1:	EX_LD_FP(LOAD(ldub, %o1, %o3), U1_o2_0_fp)
	add		%o1, 1, %o1
	subcc		%o2, 1, %o2
	EX_ST_FP(STORE(stb, %o3, %o0), U1_o2_1_fp)
	bne,pt		%xcc, 1b
	 add		%o0, 1, %o0

2:	membar		#StoreLoad | #StoreStore
	VISExit
	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		64
70:	/* 16 <= len < (5 * 64) */
	/* Flags come from "andcc %o3, 0x7" in the caller's delay slot:
	 * not-equal means dst or src is not 8-byte aligned.
	 * %o3 = dst - src, so [%o1 + %o3] addresses the destination.
	 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

	/* 16 bytes per iteration; %o2 keeps the low four bits. */
72:	andn		%o2, 0xf, %GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U1_gs_0)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U1_gs_0)
	subcc		%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_gs_10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U1_gs_8)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
	/* Then one optional 8-byte and one optional 4-byte move. */
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U1_o2_0)
	sub		%o2, 0x8, %o2
	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_o2_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U1_o2_0)
	sub		%o2, 0x4, %o2
	EX_ST(STORE(stw, %o5, %o1 + %o3), U1_o2_4)
	add		%o1, 0x4, %o1
	/* Anything still left goes through the byte loop at 90:. */
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

	/* Unaligned medium copy: first bring dst to an 8-byte boundary
	 * one byte at a time (%g1 = 8 - (dst & 7) bytes; skipped when
	 * dst is already aligned).
	 */
75:	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1, %o5), U1_g1_0)
	subcc		%g1, 1, %g1
	EX_ST(STORE(stb, %o5, %o1 + %o3), U1_g1_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

	/* If src is now 8-byte aligned as well, reuse the aligned code
	 * above; otherwise %g1 = 8 * (src & 7) is the shift count for
	 * the merge loop at 8:.
	 */
2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

	/* Shift-and-merge loop: read aligned 8-byte words from src and
	 * build each destination word from two neighbours.
	 * %g1 = left shift, %o3 = 64 - %g1 = right shift, %g2 carries
	 * the already shifted previous word, GLOBAL_SPARE = len & ~7.
	 */
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U1_o2_0)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U1_gs_0_o2_adj)
	subcc		%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U1_gs_8_o2_adj)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	/* Restore the byte-exact src, then finish len & 7 bytes at 90:
	 * with %o3 = dst - src again.
	 */
	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80:	/* 0 < len < 16 */
	/* %o3 = dst | src | len here: use the word loop only when all
	 * three are multiples of 4.
	 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:	EX_LD(LOAD(lduw, %o1, %g1), U1_o2_0)
	subcc		%o2, 4, %o2
	EX_ST(STORE(stw, %g1, %o1 + %o3), U1_o2_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o4), %o0

	/* Generic byte loop: %o2 bytes (must be non-zero on entry),
	 * %o3 = dst - src.
	 */
	.align		32
90:	EX_LD(LOAD(ldub, %o1, %g1), U1_o2_0)
	subcc		%o2, 1, %o2
	EX_ST(STORE(stb, %g1, %o1 + %o3), U1_o2_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME
EXPORT_SYMBOL(FUNC_NAME)