/* SPDX-License-Identifier: GPL-2.0 */
/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

/* Build-time configuration.
 *
 * Every macro below is wrapped in #ifndef (or selected by __KERNEL__) so
 * that a file which #includes this one can override it first.  The
 * defaults defined here make EX_LD/EX_ST and their _FP variants expand
 * to the bare instruction and ignore the second (fixup label) argument.
 */
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 *
	 * NOTE(review): the prefetches at label 3 below reach up to
	 * src + 0x1c0 independently of len, so item 3 appears to
	 * describe the loads rather than the prefetches - confirm.
	 */

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x

	/* Fixup stubs named by the second argument of EX_LD/EX_ST.
	 *
	 * They are emitted only when no including file has defined
	 * EX_RETVAL.  Each stub is named after the expression it
	 * leaves in %o0 before returning (GS = GLOBAL_SPARE), e.g.
	 * U3_retl_o2_plus_g2_plus_8 returns %o2 + %g2 + 8.  The *_fp
	 * stubs leave through __restore_fp so that VISExitHalf runs
	 * before the return.
	 *
	 * NOTE(review): presumably %o0 is the number of bytes left
	 * uncopied when a user access faults - confirm against the
	 * files that override EX_LD/EX_ST.  The EX_LD_FP/EX_ST_FP
	 * call sites name these stubs without the _fp suffix, so the
	 * overriding macros presumably append it - confirm.
	 */
__restore_fp:
	VISExitHalf
	retl
	 nop
ENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	add	%g1, 1, %g1
	add	%g2, %g1, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
ENTRY(U3_retl_o2_plus_g2_fp)
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_fp)
ENTRY(U3_retl_o2_plus_g2_plus_8_fp)
	add	%g2, 8, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
ENTRY(U3_retl_o2)
	retl
	 mov	%o2, %o0
ENDPROC(U3_retl_o2)
ENTRY(U3_retl_o2_plus_1)
	retl
	 add	%o2, 1, %o0
ENDPROC(U3_retl_o2_plus_1)
ENTRY(U3_retl_o2_plus_4)
	retl
	 add	%o2, 4, %o0
ENDPROC(U3_retl_o2_plus_4)
ENTRY(U3_retl_o2_plus_8)
	retl
	 add	%o2, 8, %o0
ENDPROC(U3_retl_o2_plus_8)
ENTRY(U3_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	retl
	 add	%o2, %g1, %o0
ENDPROC(U3_retl_o2_plus_g1_plus_1)
ENTRY(U3_retl_o2_fp)
	ba,pt	%xcc, __restore_fp
	 mov	%o2, %o0
ENDPROC(U3_retl_o2_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x80, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x40, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
ENTRY(U3_retl_o2_plus_GS_plus_0x10)
	add	GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x10)
ENTRY(U3_retl_o2_plus_GS_plus_0x08)
	add	GLOBAL_SPARE, 0x08, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
ENTRY(U3_retl_o2_and_7_plus_GS)
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS)
ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
#endif

	.align		64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	/* FUNC_NAME(dst, src, len)
	 *
	 * In:   %o0 = dst, %o1 = src, %o2 = len
	 * Out:  %o0 = EX_RETVAL(%o4), where %o4 is the dst passed in
	 *
	 * Path selection by len:
	 *   len == 0    -> end_return
	 *   len <  16   -> less_than_16
	 *   len <  192  -> less_than_192
	 *   otherwise   -> VIS path using 64-byte block stores
	 *
	 * Register roles:
	 *   %o4           copy of the original dst for the return value
	 *   %o3           (dst | src [| len]) for the alignment tests,
	 *                 later (dst - src) or a loop counter
	 *   %o5           must stay untouched between VISEntryHalf and
	 *                 VISExitHalf; scratch everywhere else
	 *   %g1           VIS path: (src & 0x7) taken before alignaddr
	 *   %g2, %g3      scratch / byte counters
	 *   GLOBAL_SPARE  (dst - src), then a count of bulk bytes
	 *
	 * An instruction indented by one extra space sits in the delay
	 * slot of the branch (or retl) directly above it.
	 */
	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0

	/* software trap 5 "Range Check" if len >= 0x80000000 */
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4

	/* if len == 0 */
	cmp		%o2, 0
	be,pn		%XCC, end_return
	 or		%o0, %o1, %o3

	/* if len < 16 (annulled slot: len is OR-ed into %o3 only
	 * when the branch is taken)
	 */
	cmp		%o2, 16
	blu,a,pn	%XCC, less_than_16
	 or		%o3, %o2, %o3

	/* if len < 192 (the slot sets the condition codes that
	 * less_than_192 tests first)
	 */
	cmp		%o2, (3 * 64)
	blu,pt		%XCC, less_than_192
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	 sub		%o0, %o1, GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

	/* Copy %g1 single bytes; %o1 + GLOBAL_SPARE addresses dst. */
1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1)
	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0

	/* Save (src & 0x7) in %g1, then copy the %g2 bytes still
	 * needed for dst alignment, 8 at a time.
	 */
2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

	/* Prime the block loop: issue the prefetches, load the first
	 * 64 bytes plus one doubleword, and set GLOBAL_SPARE to
	 * len rounded down to a multiple of 64.
	 */
3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2)
	faligndata	%f6, %f8, %f22

	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2)
	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2)
	faligndata	%f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2)

	/* %o3 = (GLOBAL_SPARE - 0x80) >> 6 is the loop trip count;
	 * skip the loop when nothing is left beyond the 0x80 bytes
	 * handled by the code at label 2.
	 */
	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop

	/* Main loop: one 64-byte block store per iteration, with the
	 * loads for the next block interleaved with the faligndata
	 * operations.
	 */
	.align		64
1:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80)

	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1

	/* Finally we copy the last full 64-byte block. */
2:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f8, %f10, %f24
	/* The doubleword at +0x40 is loaded only when src was not
	 * 8-byte aligned (%g1 != 0).
	 */
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	/* 'sub' leaves the condition codes alone, so the be,a below
	 * still tests (%g1 == 0).  Its annulled slot loads %f0 only
	 * in that case, i.e. when the +0x40 load above was skipped.
	 */
	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2)

1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
2:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, end_return
	 sub		%o0, %o1, %o3

	/* Unaligned src goes to the byte loop at 90; otherwise use
	 * one 8/4/2/1-byte move for each bit set in the remaining
	 * len.  %o1 + %o3 addresses dst.
	 */
	andcc		%g1, 0x7, %g0
	bne,pn		%icc, 90f
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x8, %o1
	sub		%o2, 8, %o2

1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x4, %o1
	sub		%o2, 4, %o2

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x2, %o1
	sub		%o2, 2, %o2

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, end_return
	 nop
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
	ba,pt		%xcc, end_return
	 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)

	.align		64
	/* 16 <= len < 192 */
less_than_192:
	/* Condition codes come from "andcc %o3, 0x7, %g0" in the
	 * delay slot of the branch that got us here: non-zero means
	 * dst and src are not both 8-byte aligned.
	 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

	/* Both pointers 8-byte aligned: 16 bytes per iteration, with
	 * GLOBAL_SPARE = len & ~0xf and %o2 = len & 0xf.
	 */
72:
	andn		%o2, 0xf, GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, end_return
	 nop
	ba,pt		%xcc, 90f
	 nop

	/* Not mutually 8-byte aligned: first copy single bytes until
	 * dst is 8-byte aligned (%g1 = 8 - (dst & 0x7) of them; none
	 * when dst & 0x7 is already zero).
	 */
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
	EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

	/* dst is aligned now.  If src is as well, rejoin the aligned
	 * code at 72/73; otherwise %g1 = (src & 0x7) * 8 is the bit
	 * shift used by the merge loop at 8.
	 */
2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

	/* Shift-and-merge copy: read aligned doublewords from src and
	 * combine neighbouring pairs (left shift by %g1 bits, right
	 * shift by %o3 = 64 - %g1 bits) into aligned 8-byte stores.
	 */
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	/* Turn %g1 back into a byte offset, restore the unaligned
	 * src pointer and finish the last (len & 0x7) bytes at 90.
	 */
	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, end_return
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
	/* 0 < len < 16 */
less_than_16:
	/* %o3 = dst | src | len here; copy words only when all three
	 * are multiples of 4, else fall back to the byte loop at 90.
	 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

end_return:
	retl
	 mov		EX_RETVAL(%o4), %o0

	/* Byte-at-a-time copy of %o2 bytes; %o1 + %o3 addresses dst. */
	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
	EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME