xref: /OK3568_Linux_fs/kernel/lib/raid6/recov_avx512.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Copyright (C) 2016 Intel Corporation
4*4882a593Smuzhiyun  *
5*4882a593Smuzhiyun  * Author: Gayatri Kammela <gayatri.kammela@intel.com>
6*4882a593Smuzhiyun  * Author: Megha Dey <megha.dey@linux.intel.com>
7*4882a593Smuzhiyun  */
8*4882a593Smuzhiyun 
9*4882a593Smuzhiyun #ifdef CONFIG_AS_AVX512
10*4882a593Smuzhiyun 
11*4882a593Smuzhiyun #include <linux/raid/pq.h>
12*4882a593Smuzhiyun #include "x86.h"
13*4882a593Smuzhiyun 
raid6_has_avx512(void)14*4882a593Smuzhiyun static int raid6_has_avx512(void)
15*4882a593Smuzhiyun {
16*4882a593Smuzhiyun 	return boot_cpu_has(X86_FEATURE_AVX2) &&
17*4882a593Smuzhiyun 		boot_cpu_has(X86_FEATURE_AVX) &&
18*4882a593Smuzhiyun 		boot_cpu_has(X86_FEATURE_AVX512F) &&
19*4882a593Smuzhiyun 		boot_cpu_has(X86_FEATURE_AVX512BW) &&
20*4882a593Smuzhiyun 		boot_cpu_has(X86_FEATURE_AVX512VL) &&
21*4882a593Smuzhiyun 		boot_cpu_has(X86_FEATURE_AVX512DQ);
22*4882a593Smuzhiyun }
23*4882a593Smuzhiyun 
/*
 * Recover two failed data blocks from the P and Q syndromes (AVX-512).
 *
 * disks:        total block count; P is ptrs[disks-2], Q is ptrs[disks-1],
 *               data blocks come before them.
 * bytes:        size of each block.  Consumed 128 bytes per iteration on
 *               x86-64 (two 64-byte lanes), 64 bytes otherwise; the aligned
 *               loads (vmovdqa64) require 64-byte aligned buffers and a
 *               bytes value that is a multiple of the stride -- presumably
 *               guaranteed by the raid6 callers, verify if reused.
 * faila, failb: indices of the two failed data blocks.  The pbmul lookup
 *               indexes raid6_gfexi[failb-faila], so presumably
 *               faila < failb -- TODO confirm against callers.
 * ptrs:         block pointer table; temporarily rewired to regenerate the
 *               syndromes, restored before the recovery math runs.
 */
static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
				     int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */
	const u8 x0f = 0x0f;	/* low-nibble mask, broadcast below */

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data pages
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q
	 */

	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila]   = dp;
	ptrs[failb]   = dq;
	ptrs[disks-2] = p;
	ptrs[disks-1] = q;

	/*
	 * Now, pick the proper data tables.  Each raid6_vgfmul entry is a
	 * pair of 16-byte nibble lookup tables (low nibble at [0], high
	 * nibble at [16]) implementing a GF(2^8) constant multiply.
	 */
	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
		raid6_gfexp[failb]]];

	kernel_fpu_begin();

	/*
	 * zmm7 = x0f replicated in all 64 bytes: the low-nibble mask used
	 * by every vpshufb table lookup below.  (An earlier comment said
	 * zmm0; the broadcast target is zmm7.)
	 */
	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));

	while (bytes) {
#ifdef CONFIG_X86_64
		/* Two 64-byte lanes per iteration on 64-bit. */
		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
			     "vmovdqa64 %1, %%zmm9\n\t"
			     "vmovdqa64 %2, %%zmm0\n\t"
			     "vmovdqa64 %3, %%zmm8\n\t"
			     "vpxorq %4, %%zmm1, %%zmm1\n\t"
			     "vpxorq %5, %%zmm9, %%zmm9\n\t"
			     "vpxorq %6, %%zmm0, %%zmm0\n\t"
			     "vpxorq %7, %%zmm8, %%zmm8"
			     :
			     : "m" (q[0]), "m" (q[64]), "m" (p[0]),
			       "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
			       "m" (dp[0]), "m" (dp[64]));

		/*
		 * 1 = dq[0]  ^ q[0]
		 * 9 = dq[64] ^ q[64]
		 * 0 = dp[0]  ^ p[0]
		 * 8 = dp[64] ^ p[64]
		 */

		/* Broadcast the two 16-byte halves of qmul across lanes. */
		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm5"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		/*
		 * Nibble-wise GF multiply by qmul: split each byte into
		 * low/high nibbles, look both up via vpshufb, xor results.
		 */
		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
			     "vpsraw $4, %%zmm9, %%zmm12\n\t"
			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
			     "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
			     "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
			     "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
			     :
			     : );

		/*
		 * 5 = qx[0]
		 * 15 = qx[64]
		 */

		/* Same nibble-table multiply of the P delta by pbmul. */
		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm1\n\t"
			     "vpsraw $4, %%zmm0, %%zmm2\n\t"
			     "vpsraw $4, %%zmm8, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm12, %%zmm13, %%zmm13"
			     :
			     : "m" (pbmul[0]), "m" (pbmul[16]));

		/*
		 * 1  = pbmul[px[0]]
		 * 13 = pbmul[px[64]]
		 */
		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm15, %%zmm13, %%zmm13"
			     :
			     : );

		/*
		 * 1 = db = DQ            (recovered failb data)
		 * 13 = db[64] = DQ[64]
		 */
		/* Store DQ, then derive DP = dp ^ DQ in zmm0/zmm8. */
		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm13,%1\n\t"
			     "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
			     "vpxorq %%zmm13, %%zmm8, %%zmm8"
			     :
			     : "m" (dq[0]), "m" (dq[64]));

		asm volatile("vmovdqa64 %%zmm0, %0\n\t"
			     "vmovdqa64 %%zmm8, %1"
			     :
			     : "m" (dp[0]), "m" (dp[64]));

		bytes -= 128;
		p += 128;
		q += 128;
		dp += 128;
		dq += 128;
#else
		/* Single 64-byte lane per iteration on 32-bit. */
		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
			     "vmovdqa64 %1, %%zmm0\n\t"
			     "vpxorq %2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %3, %%zmm0, %%zmm0"
			     :
			     : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp));

		/* 1 = dq ^ q;  0 = dp ^ p */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm5"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		/*
		 * 1 = dq ^ q
		 * 3 = (dq ^ q) >> 4
		 */
		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
			     :
			     : );

		/* 5 = qx */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm1"
			     :
			     : "m" (pbmul[0]), "m" (pbmul[16]));

		asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm4, %%zmm1, %%zmm1"
			     :
			     : );

		/* 1 = pbmul[px] */
		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
			     /* 1 = db = DQ */
			     "vmovdqa64 %%zmm1, %0\n\t"
			     :
			     : "m" (dq[0]));

		asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
			     "vmovdqa64 %%zmm0, %0"
			     :
			     : "m" (dp[0]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
#endif
	}

	kernel_fpu_end();
}
226*4882a593Smuzhiyun 
/*
 * Recover one failed data block plus the P syndrome using Q (AVX-512).
 *
 * disks: total block count; P is ptrs[disks-2], Q is ptrs[disks-1].
 * bytes: size of each block; consumed 128 bytes per iteration on x86-64
 *        (two 64-byte lanes), 64 bytes otherwise.  Aligned loads
 *        (vmovdqa64) require 64-byte aligned buffers -- presumably
 *        guaranteed by the raid6 callers.
 * faila: index of the failed data block.
 * ptrs:  block pointer table; temporarily rewired to regenerate the
 *        Q syndrome, restored before the recovery math runs.
 */
static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
				     void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */
	const u8 x0f = 0x0f;	/* low-nibble mask, broadcast below */

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data page
	 * Use the dead data page as temporary storage for delta q
	 */

	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila]   = dq;
	ptrs[disks-1] = q;

	/*
	 * Now, pick the proper data tables: a pair of 16-byte nibble
	 * lookup tables (low at [0], high at [16]) for the GF(2^8)
	 * multiply by the inverse of g^faila.
	 */
	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];

	kernel_fpu_begin();

	/* zmm7 = x0f in all 64 bytes: nibble mask for vpshufb lookups */
	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));

	while (bytes) {
#ifdef CONFIG_X86_64
		/* Two 64-byte lanes per iteration on 64-bit. */
		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
			     "vmovdqa64 %1, %%zmm8\n\t"
			     "vpxorq %2, %%zmm3, %%zmm3\n\t"
			     "vpxorq %3, %%zmm8, %%zmm8"
			     :
			     : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
			       "m" (q[64]));

		/*
		 * 3 = q[0] ^ dq[0]
		 * 8 = q[64] ^ dq[64]
		 */
		/*
		 * Broadcast the qmul halves; vmovapd only duplicates the
		 * tables into zmm13/zmm14 for the second lane's lookups.
		 */
		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
			     "vmovapd %%zmm0, %%zmm13\n\t"
			     "vbroadcasti64x2 %1, %%zmm1\n\t"
			     "vmovapd %%zmm1, %%zmm14"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		/*
		 * Nibble-wise GF multiply: split into low/high nibbles,
		 * look both up via vpshufb, xor the partial products.
		 */
		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
			     "vpsraw $4, %%zmm8, %%zmm12\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
			     "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
			     "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
			     "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm13, %%zmm14, %%zmm14"
			     :
			     : );

		/*
		 * 1  = qmul[q[0]  ^ dq[0]]
		 * 14 = qmul[q[64] ^ dq[64]]
		 */
		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
			     "vmovdqa64 %1, %%zmm12\n\t"
			     "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
			     "vpxorq %%zmm14, %%zmm12, %%zmm12"
			     :
			     : "m" (p[0]), "m" (p[64]));

		/*
		 * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]]
		 * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
		 */

		/* Store recovered data (zmm1/zmm14) and new P (zmm2/zmm12). */
		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm14, %1\n\t"
			     "vmovdqa64 %%zmm2, %2\n\t"
			     "vmovdqa64 %%zmm12,%3"
			     :
			     : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
			       "m" (p[64]));

		bytes -= 128;
		p += 128;
		q += 128;
		dq += 128;
#else
		/* Single 64-byte lane per iteration on 32-bit. */
		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
			     "vpxorq %1, %%zmm3, %%zmm3"
			     :
			     : "m" (dq[0]), "m" (q[0]));

		/* 3 = q ^ dq */

		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
			     "vbroadcasti64x2 %1, %%zmm1"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm0, %%zmm1, %%zmm1"
			     :
			     : );

		/* 1 = qmul[q ^ dq] */

		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
			     "vpxorq %%zmm1, %%zmm2, %%zmm2"
			     :
			     : "m" (p[0]));

		/* 2 = p ^ qmul[q ^ dq] */

		/* Store recovered data (zmm1) and new P (zmm2). */
		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm2, %1"
			     :
			     : "m" (dq[0]), "m" (p[0]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
#endif
	}

	kernel_fpu_end();
}
368*4882a593Smuzhiyun 
369*4882a593Smuzhiyun const struct raid6_recov_calls raid6_recov_avx512 = {
370*4882a593Smuzhiyun 	.data2 = raid6_2data_recov_avx512,
371*4882a593Smuzhiyun 	.datap = raid6_datap_recov_avx512,
372*4882a593Smuzhiyun 	.valid = raid6_has_avx512,
373*4882a593Smuzhiyun #ifdef CONFIG_X86_64
374*4882a593Smuzhiyun 	.name = "avx512x2",
375*4882a593Smuzhiyun #else
376*4882a593Smuzhiyun 	.name = "avx512x1",
377*4882a593Smuzhiyun #endif
378*4882a593Smuzhiyun 	.priority = 3,
379*4882a593Smuzhiyun };
380*4882a593Smuzhiyun 
381*4882a593Smuzhiyun #else
382*4882a593Smuzhiyun #warning "your version of binutils lacks AVX512 support"
383*4882a593Smuzhiyun #endif
384