/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm/lib/memset.S
 *
 * Copyright (C) 1995-2000 Russell King
 *
 * ASM optimised string functions
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/unwind.h>

	.text
	.align	5

ENTRY(mmioset)
ENTRY(memset)
UNWIND( .fnstart )
	ands	r3, r0, #3		@ 1 unaligned?
	mov	ip, r0			@ preserve r0 as return value
	bne	6f			@ 1
/*
 * we know that the pointer in ip is aligned to a word boundary.
 */
1:	orr	r1, r1, r1, lsl #8
	orr	r1, r1, r1, lsl #16
	mov	r3, r1
7:	cmp	r2, #16
	blt	4f

#if ! CALGN(1)+0

/*
 * We need 2 extra registers for this loop - use r8 and the LR
 */
	stmfd	sp!, {r8, lr}
UNWIND( .fnend )
UNWIND( .fnstart )
UNWIND( .save {r8, lr} )
	mov	r8, r1
	mov	lr, r3

2:	subs	r2, r2, #64
	stmiage	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
	stmiage	ip!, {r1, r3, r8, lr}
	stmiage	ip!, {r1, r3, r8, lr}
	stmiage	ip!, {r1, r3, r8, lr}
	bgt	2b
	ldmfdeq	sp!, {r8, pc}		@ Now <64 bytes to go.
/*
 * No need to correct the count; we're only testing bits from now on
 */
	tst	r2, #32
	stmiane	ip!, {r1, r3, r8, lr}
	stmiane	ip!, {r1, r3, r8, lr}
	tst	r2, #16
	stmiane	ip!, {r1, r3, r8, lr}
	ldmfd	sp!, {r8, lr}
UNWIND( .fnend )

#else

/*
 * This version aligns the destination pointer in order to write
 * whole cache lines at once.
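 *
 * For counts above 96 bytes with a destination that is not already
 * 32-byte aligned, up to 28 bytes are stored first to reach the next
 * 32-byte boundary.  That byte count is shifted left by 28 so that
 * its bit 4 lands in the carry flag, bit 3 in the negative flag and
 * bit 2 in bit 30 of r8, selecting a 16-, 8- and 4-byte store
 * respectively.  The main loop then writes 64 bytes per iteration
 * from eight registers holding the fill pattern.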
 */

	stmfd	sp!, {r4-r8, lr}
UNWIND( .fnend )
UNWIND( .fnstart )
UNWIND( .save {r4-r8, lr} )
	mov	r4, r1
	mov	r5, r3
	mov	r6, r1
	mov	r7, r3
	mov	r8, r1
	mov	lr, r3

	cmp	r2, #96
	tstgt	ip, #31
	ble	3f

	and	r8, ip, #31
	rsb	r8, r8, #32
	sub	r2, r2, r8
	movs	r8, r8, lsl #(32 - 4)
	stmiacs	ip!, {r4, r5, r6, r7}
	stmiami	ip!, {r4, r5}
	tst	r8, #(1 << 30)
	mov	r8, r1
	strne	r1, [ip], #4

3:	subs	r2, r2, #64
	stmiage	ip!, {r1, r3-r8, lr}
	stmiage	ip!, {r1, r3-r8, lr}
	bgt	3b
	ldmfdeq	sp!, {r4-r8, pc}

	tst	r2, #32
	stmiane	ip!, {r1, r3-r8, lr}
	tst	r2, #16
	stmiane	ip!, {r4-r7}
	ldmfd	sp!, {r4-r8, lr}
UNWIND( .fnend )

#endif

UNWIND( .fnstart )
4:	tst	r2, #8
	stmiane	ip!, {r1, r3}
	tst	r2, #4
	strne	r1, [ip], #4
/*
 * When we get here, we've got less than 4 bytes to set. We
 * may have an unaligned pointer as well.
 */
5:	tst	r2, #2
	strbne	r1, [ip], #1
	strbne	r1, [ip], #1
	tst	r2, #1
	strbne	r1, [ip], #1
	ret	lr

6:	subs	r2, r2, #4		@ 1 do we have enough
	blt	5b			@ 1 bytes to align with?
	cmp	r3, #2			@ 1
	strblt	r1, [ip], #1		@ 1
	strble	r1, [ip], #1		@ 1
	strb	r1, [ip], #1		@ 1
	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
	b	1b
UNWIND( .fnend )
ENDPROC(memset)
ENDPROC(mmioset)

ENTRY(__memset32)
UNWIND( .fnstart )
	mov	r3, r1			@ copy r1 to r3 and fall into memset64
UNWIND( .fnend )
ENDPROC(__memset32)
ENTRY(__memset64)
UNWIND( .fnstart )
	mov	ip, r0			@ preserve r0 as return value
	b	7b			@ jump into the middle of memset
UNWIND( .fnend )
ENDPROC(__memset64)