/* SPDX-License-Identifier: GPL-2.0 */
/* NG4memset.S: Niagara-4 optimized memset/bzero.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#include <asm/asi.h>

/*
 * Two entry points share one store engine:
 *
 *   NG4memset:  %o0 = destination, %o1 = fill value (only the low 8 bits
 *               are used), %o2 = length in bytes
 *   NG4bzero:   %o0 = destination, %o1 = length in bytes
 *
 * Both return the original destination pointer in %o0.
 *
 * Register roles inside the shared body:
 *   %o0  running store pointer
 *   %o1  bytes still to be stored after the current phase
 *   %o3  copy of the original %o0, handed back on exit
 *   %o4  fill pattern: the fill byte replicated into all 8 bytes (0 for bzero)
 *   %g1  byte counter for the phase currently running
 *   %g2, %g3, %o5  constant offsets 0x20, 0x08, 0x28 for the 64-byte loops
 *
 * The instruction indented by one extra space after a branch is that
 * branch's delay slot.
 *
 * NOTE(review): every length test below uses the 32-bit %icc condition
 * codes (the bare "ble" included), so lengths are presumably expected to
 * fit in 31 bits -- confirm against the callers.
 */

	.register	%g2, #scratch
	.register	%g3, #scratch

	.text
	.align		32
	.globl		NG4memset
NG4memset:
	andcc		%o1, 0xff, %o4		/* %o4 = fill byte; sets Z if it is 0 */
	be,pt		%icc, 1f		/* zero fill: pattern is already 0 */
	 mov		%o2, %o1		/* delay slot: %o1 = length (bzero layout) */
	/* Replicate the byte: 8 -> 16 -> 32 -> 64 bits. */
	sllx		%o4, 8, %g1
	or		%g1, %o4, %o2
	sllx		%o2, 16, %g1
	or		%g1, %o2, %o2
	sllx		%o2, 32, %g1
	ba,pt		%icc, 1f		/* join the shared body in NG4bzero */
	 or		%g1, %o2, %o4		/* delay slot: %o4 = 64-bit pattern */
	.size		NG4memset,.-NG4memset

	.align		32
	.globl		NG4bzero
NG4bzero:
	clr		%o4			/* fill pattern = 0 */
1:	cmp		%o1, 16			/* shared body; NG4memset enters here */
	ble		%icc, .Ltiny		/* <= 16 bytes: plain byte loop */
	 mov		%o0, %o3		/* delay slot: save return value (both paths) */

	/* Byte stores until %o0 is 8-byte aligned. */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1		/* %g1 = (-dst) & 7 = bytes to alignment */
	brz,pt		%g1, .Laligned8
	 sub		%o1, %g1, %o1		/* delay slot: account for the head bytes */
1:	stb		%o4, [%o0 + 0x00]
	subcc		%g1, 1, %g1
	bne,pt		%icc, 1b
	 add		%o0, 1, %o0

.Laligned8:
	/*
	 * The 64-byte path is only taken for more than 64 + 56 bytes: up to
	 * 56 bytes may be spent reaching 64-byte alignment, which still
	 * leaves more than one full 64-byte block.
	 */
	cmp		%o1, 64 + (64 - 8)
	ble		.Lmedium
	 sub		%g0, %o0, %g1		/* delay slot: start of (-dst) & 63 */
	andcc		%g1, (64 - 1), %g1	/* %g1 = bytes to 64-byte alignment */
	brz,pn		%g1, .Laligned64
	 sub		%o1, %g1, %o1		/* delay slot: account for those bytes */
1:	stx		%o4, [%o0 + 0x00]	/* 8-byte stores up to the 64-byte boundary */
	subcc		%g1, 8, %g1
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

.Laligned64:
	andn		%o1, 64 - 1, %g1	/* %g1 = bytes covered by whole 64-byte blocks */
	sub		%o1, %g1, %o1		/* %o1 = tail bytes (< 64) */
	brnz,pn		%o4, .Lnon_bzero_loop	/* non-zero pattern: store every doubleword */
	 mov		0x20, %g2		/* delay slot: offset used by both loops */
	/*
	 * Zero fill: only offsets 0x00 and 0x20 of each 64-byte block are
	 * written.  NOTE(review): this presumably relies on the block-init
	 * store ASI zero-filling the rest of each 32-byte half -- confirm
	 * against the CPU documentation.
	 */
1:	stxa		%o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
	subcc		%g1, 0x40, %g1
	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
	bne,pt		%icc, 1b
	 add		%o0, 0x40, %o0

.Lpostloop:
	cmp		%o1, 8
	bl,pn		%icc, .Ltiny		/* fewer than 8 tail bytes */
	 membar		#StoreStore|#StoreLoad	/* delay slot: runs on both paths, after the block-init stores */

.Lmedium:
	andn		%o1, 0x7, %g1		/* %g1 = bytes covered by whole doublewords */
	sub		%o1, %g1, %o1		/* %o1 = remaining bytes (< 8) */
1:	stx		%o4, [%o0 + 0x00]
	subcc		%g1, 0x8, %g1
	bne,pt		%icc, 1b
	 add		%o0, 0x08, %o0
	andcc		%o1, 0x4, %g1		/* is a 4-byte piece left? */
	be,pt		%icc, .Ltiny
	 sub		%o1, %g1, %o1		/* delay slot: subtracts 4, or 0 when branching */
	stw		%o4, [%o0 + 0x00]
	add		%o0, 0x4, %o0

.Ltiny:
	cmp		%o1, 0
	be,pn		%icc, .Lexit
	/*
	 * The subcc below is both the delay slot of the branch above and the
	 * top of the byte loop; when exiting, %o1 is decremented once more
	 * but is not used afterwards.
	 */
1:	 subcc		%o1, 1, %o1
	stb		%o4, [%o0 + 0x00]
	bne,pt		%icc, 1b
	 add		%o0, 1, %o0

.Lexit:
	retl
	 mov		%o3, %o0		/* delay slot: return the original destination */

.Lnon_bzero_loop:
	/*
	 * Non-zero pattern: write all eight doublewords of each 64-byte block.
	 * Offsets hit per iteration: 0x00, 0x20, 0x08, 0x28, then (after
	 * advancing %o0 by 0x10) 0x10, 0x30, 0x18, 0x38.
	 */
	mov		0x08, %g3
	mov		0x28, %o5
1:	stxa		%o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
	subcc		%g1, 0x40, %g1		/* flags consumed by the bne below */
	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
	stxa		%o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
	stxa		%o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P
	add		%o0, 0x10, %o0
	stxa		%o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
	stxa		%o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
	stxa		%o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P
	bne,pt		%icc, 1b
	 add		%o0, 0x30, %o0		/* delay slot: 0x10 + 0x30 = one 64-byte block */
	ba,a,pt		%icc, .Lpostloop	/* annulled: the nop below is not executed */
	 nop
	.size		NG4bzero,.-NG4bzero