xref: /OK3568_Linux_fs/kernel/arch/sh/lib/memset-sh4.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * "memset" implementation for SH4
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 1999  Niibe Yutaka
6*4882a593Smuzhiyun * Copyright (c) 2009  STMicroelectronics Limited
7*4882a593Smuzhiyun * Author: Stuart Menefy <stuart.menefy:st.com>
8*4882a593Smuzhiyun */
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun/*
11*4882a593Smuzhiyun *            void *memset(void *s, int c, size_t n);
12*4882a593Smuzhiyun */
13*4882a593Smuzhiyun
14*4882a593Smuzhiyun#include <linux/linkage.h>
15*4882a593Smuzhiyun
/*
 * memset: store the low byte of c into n consecutive bytes starting at s.
 *
 * Register use, as read from the code below:
 *   r4     in:  s.  Moved to s + n on entry and then walked back down by
 *               every store, so the buffer is filled from its end towards
 *               its start.
 *   r5     in:  c.  Byte stores use its low byte; once at least 12 bytes
 *               are requested it is widened to that byte repeated 4 times.
 *   r6     in:  n.  Remaining byte count, updated phase by phase.
 *   r0-r3  scratch (loop counters, masks, thresholds).
 *   r0     out: the final r4, which is back at s once n bytes are stored.
 *
 * The instruction that follows bt/s, bf/s and rts (written with one extra
 * leading space) occupies the branch delay slot and executes whether or
 * not the branch is taken.
 *
 * NOTE(review): the first length check uses cmp/gt, a signed compare, so
 * an n with bit 31 set is routed to the byte-at-a-time loop at 40.  The
 * result is still n bytes written, only slowly -- confirm this is intended.
 */
16*4882a593SmuzhiyunENTRY(memset)
/*
 * r0 = 12 (small-size threshold), r4 = one past the last byte to fill.
 * With fewer than 12 bytes, go straight to the byte loop at 40.
 */
17*4882a593Smuzhiyun	mov	#12,r0
18*4882a593Smuzhiyun	add	r6,r4
19*4882a593Smuzhiyun	cmp/gt	r6,r0
20*4882a593Smuzhiyun	bt/s	40f		! if it's too small, set a byte at once
21*4882a593Smuzhiyun	 mov	r4,r0
/*
 * r0 = end address & 3, i.e. the bytes lying above the last longword
 * boundary.  The delay slot takes them off r6 (no change when r0 is 0).
 */
22*4882a593Smuzhiyun	and	#3,r0
23*4882a593Smuzhiyun	cmp/eq	#0,r0
24*4882a593Smuzhiyun	bt/s	2f		! It's aligned
25*4882a593Smuzhiyun	 sub	r0,r6
/* Store r0 (1..3) single bytes so that r4 becomes longword aligned. */
26*4882a593Smuzhiyun1:
27*4882a593Smuzhiyun	dt	r0
28*4882a593Smuzhiyun	bf/s	1b
29*4882a593Smuzhiyun	 mov.b	r5,@-r4
/* Replicate the fill byte into all four bytes of r5 (r0 is scratch). */
30*4882a593Smuzhiyun2:				! make VVVV
31*4882a593Smuzhiyun	extu.b	r5,r5
32*4882a593Smuzhiyun	swap.b	r5,r0		!   V0
33*4882a593Smuzhiyun	or	r0,r5		!   VV
34*4882a593Smuzhiyun	swap.w	r5,r0		! VV00
35*4882a593Smuzhiyun	or	r0,r5		! VVVV
36*4882a593Smuzhiyun
37*4882a593Smuzhiyun	! Check if enough bytes need to be copied to be worth the big loop
38*4882a593Smuzhiyun	mov	#0x40, r0	! (MT)
39*4882a593Smuzhiyun	cmp/gt	r6,r0		! (MT)  64 > len => slow loop
40*4882a593Smuzhiyun
/*
 * Fewer than 64 bytes left: enter the 8-byte loop at 22.  The delay slot
 * preloads r0 = r6, which label 22 converts into a chunk count.
 */
41*4882a593Smuzhiyun	bt/s	22f
42*4882a593Smuzhiyun	 mov	r6,r0
43*4882a593Smuzhiyun
/*
 * r3 = r4, r1 = r4 rounded down to a 32-byte boundary.  If they are equal
 * r4 is already block aligned; otherwise r3 becomes the byte distance down
 * to that boundary and shlr2 turns it into a longword count.
 */
44*4882a593Smuzhiyun	! align the dst to the cache block size if necessary
45*4882a593Smuzhiyun	mov	r4, r3
46*4882a593Smuzhiyun	mov	#~(0x1f), r1
47*4882a593Smuzhiyun
48*4882a593Smuzhiyun	and	r3, r1
49*4882a593Smuzhiyun	cmp/eq	r3, r1
50*4882a593Smuzhiyun
51*4882a593Smuzhiyun	bt/s	11f		! dst is already aligned
52*4882a593Smuzhiyun	 sub	r1, r3		! r3-r1 -> r3
53*4882a593Smuzhiyun	shlr2	r3		! number of loops
54*4882a593Smuzhiyun
/*
 * Store r3 longwords to bring r4 down to the 32-byte boundary; the delay
 * slot subtracts 4 from r6 for every longword stored.
 */
55*4882a593Smuzhiyun10:	mov.l	r5,@-r4
56*4882a593Smuzhiyun	dt	r3
57*4882a593Smuzhiyun	bf/s	10b
58*4882a593Smuzhiyun	 add	#-4, r6
59*4882a593Smuzhiyun
/*
 * r2 = r6 >> 5 = number of whole 32-byte blocks (shld with a count of -5
 * shifts right by 5).  r4 is moved down to the base of the first block and
 * r0 gets a copy of the fill pattern because movca.l stores from r0.
 */
60*4882a593Smuzhiyun11:	! dst is 32byte aligned
61*4882a593Smuzhiyun	mov	r6,r2
62*4882a593Smuzhiyun	mov	#-5,r0
63*4882a593Smuzhiyun	shld	r0,r2		! number of loops
64*4882a593Smuzhiyun
65*4882a593Smuzhiyun	add	#-32, r4
66*4882a593Smuzhiyun	mov	r5, r0
/*
 * One 32-byte block per iteration: movca.l writes the longword at offset 0
 * and seven mov.l stores cover offsets 4..28.  r6 (-32) and r2 (dt) are
 * updated between the stores; the delay slot steps r4 down to the next
 * block.
 */
67*4882a593Smuzhiyun12:
68*4882a593Smuzhiyun	movca.l	r0,@r4
69*4882a593Smuzhiyun	mov.l	r5,@(4, r4)
70*4882a593Smuzhiyun	mov.l	r5,@(8, r4)
71*4882a593Smuzhiyun	mov.l	r5,@(12,r4)
72*4882a593Smuzhiyun	mov.l	r5,@(16,r4)
73*4882a593Smuzhiyun	mov.l	r5,@(20,r4)
74*4882a593Smuzhiyun	add	#-0x20, r6
75*4882a593Smuzhiyun	mov.l	r5,@(24,r4)
76*4882a593Smuzhiyun	dt	r2
77*4882a593Smuzhiyun	mov.l	r5,@(28,r4)
78*4882a593Smuzhiyun	bf/s	12b
79*4882a593Smuzhiyun	 add	#-32, r4
80*4882a593Smuzhiyun
/*
 * Undo the extra -32 applied by the final delay slot so r4 points at the
 * lowest byte written so far.  With fewer than 8 bytes left, skip to the
 * byte loop; otherwise reload r0 = r6 and fall into the 8-byte loop.
 */
81*4882a593Smuzhiyun	add	#32, r4
82*4882a593Smuzhiyun	mov	#8, r0
83*4882a593Smuzhiyun	cmp/ge	r0, r6
84*4882a593Smuzhiyun	bf	40f
85*4882a593Smuzhiyun
86*4882a593Smuzhiyun	mov	r6,r0
/* r0 holds the remaining byte count here on both entry paths. */
87*4882a593Smuzhiyun22:
88*4882a593Smuzhiyun	shlr2	r0
89*4882a593Smuzhiyun	shlr	r0		! r0 = r6 >> 3
/* Two longword stores per iteration, the second one in the delay slot. */
90*4882a593Smuzhiyun3:
91*4882a593Smuzhiyun	dt	r0
92*4882a593Smuzhiyun	mov.l	r5,@-r4		! set 8-byte at once
93*4882a593Smuzhiyun	bf/s	3b
94*4882a593Smuzhiyun	 mov.l	r5,@-r4
95*4882a593Smuzhiyun	!
/* r6 = r6 & 7: the 0..7 bytes the 8-byte loop did not cover. */
96*4882a593Smuzhiyun	mov	#7,r0
97*4882a593Smuzhiyun	and	r0,r6
98*4882a593Smuzhiyun
99*4882a593Smuzhiyun	! fill bytes (length may be zero)
/*
 * Byte loop: skipped entirely when r6 is 0, otherwise stores r6 bytes,
 * one per iteration, from the delay slot.
 */
100*4882a593Smuzhiyun40:	tst	r6,r6
101*4882a593Smuzhiyun	bt	5f
102*4882a593Smuzhiyun4:
103*4882a593Smuzhiyun	dt	r6
104*4882a593Smuzhiyun	bf/s	4b
105*4882a593Smuzhiyun	 mov.b	r5,@-r4
/* Return with r0 = r4 (copied in the rts delay slot). */
106*4882a593Smuzhiyun5:
107*4882a593Smuzhiyun	rts
108*4882a593Smuzhiyun	 mov	r4,r0
109