xref: /OK3568_Linux_fs/kernel/arch/x86/include/asm/xor_avx.h (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

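/*
 * BLOCK(offset, reg) is (re)defined by each xor_avx_N() routine below to
 * process one 32-byte chunk in %ymm<reg>.  BLOCK4() unrolls four such
 * chunks across %ymm0-%ymm3, and BLOCK16() covers 16 chunks, i.e. 512
 * bytes per expansion.
 */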
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

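/*
 * xor_avx_N(): XOR N buffers of 'bytes' length into the first one.
 * Each loop iteration runs BLOCK16() over a 512-byte stripe: the last
 * source is loaded into a %ymm register, the remaining sources are XORed
 * in from memory, and the result is stored back to p0.  The buffers must
 * be 32-byte aligned, since vmovdqa requires aligned addresses, and the
 * AVX register use is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 */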
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long lines = bytes >> 9;	/* number of 512-byte stripes */

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

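/*
 * Template describing the AVX routines above; xor_speed() (see
 * AVX_XOR_SPEED below) measures its throughput so the fastest available
 * xor implementation can be selected.
 */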
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

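/*
 * Benchmark the AVX template only when the CPU advertises AVX and the OS
 * has enabled XSAVE state management (OSXSAVE).
 */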
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

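/*
 * Prefer the AVX template when it is usable; otherwise fall back to the
 * fastest template passed in by the caller.
 */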
#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif