xref: /OK3568_Linux_fs/kernel/arch/openrisc/lib/memcpy.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
// SPDX-License-Identifier: GPL-2.0
/*
 * arch/openrisc/lib/memcpy.c
 *
 * Optimized memory copy routines for openrisc.  These are mostly copied
 * from other sources but slightly extended based on ideas discussed in
 * #openrisc.
 *
 * The word unroll implementation is an extension to the arm byte
 * unrolled implementation, but using word copies (if things are
 * properly aligned)
 *
 * The great arm loop unroll algorithm can be found at:
 *  arch/arm/boot/compressed/string.c
 */

#include <linux/export.h>

#include <linux/string.h>

#ifdef CONFIG_OR1K_1200
/*
 * Do memcpy with word copies and loop unrolling. This gives the
 * best performance on the OR1200 and MOR1KX architectures
 */
void *memcpy(void *dest, __const void *src, __kernel_size_t n)
{
	int i = 0;
	unsigned char *d, *s;
	uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src;

	/* If both source and dest are word aligned copy words */
	if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) {
		/* Copy 32 bytes per loop */
		for (i = n >> 5; i > 0; i--) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

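		/*
		 * Copy any remaining 16-, 8- and 4-byte chunks by testing
		 * the corresponding bits of n; at most 3 bytes are then
		 * left for the common byte tail below.
		 */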
		if (n & 1 << 4) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

		if (n & 1 << 3) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

		if (n & 1 << 2)
			*dest_w++ = *src_w++;

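		/* Switch to byte pointers for the 0-3 byte remainder */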
		d = (unsigned char *)dest_w;
		s = (unsigned char *)src_w;

	} else {
		d = (unsigned char *)dest_w;
		s = (unsigned char *)src_w;

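		/* Unaligned: fall back to byte copies, 8 per iteration */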
		for (i = n >> 3; i > 0; i--) {
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
		}

		if (n & 1 << 2) {
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
		}
	}

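	/* Copy the final 2-byte and 1-byte remainders, common to both paths */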
	if (n & 1 << 1) {
		*d++ = *s++;
		*d++ = *s++;
	}

	if (n & 1)
		*d++ = *s++;

	return dest;
}
#else
/*
 * Use word copies but no loop unrolling as we cannot assume there
 * will be benefits on the architecture
 */
void *memcpy(void *dest, __const void *src, __kernel_size_t n)
{
	unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src;
	uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src;

	/* If both source and dest are word aligned copy words */
	if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) {
		for (; n >= 4; n -= 4)
			*dest_w++ = *src_w++;
	}

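	/* Resume where the word copies stopped (or at the start if unaligned) */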
	d = (unsigned char *)dest_w;
	s = (unsigned char *)src_w;

	/* For remaining or if not aligned, copy bytes */
	for (; n >= 1; n -= 1)
		*d++ = *s++;

	return dest;

}
#endif

EXPORT_SYMBOL(memcpy);