// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 */
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun #include <linux/raid/pq.h>
8*4882a593Smuzhiyun #include "x86.h"
9*4882a593Smuzhiyun
raid6_has_avx2(void)10*4882a593Smuzhiyun static int raid6_has_avx2(void)
11*4882a593Smuzhiyun {
12*4882a593Smuzhiyun return boot_cpu_has(X86_FEATURE_AVX2) &&
13*4882a593Smuzhiyun boot_cpu_has(X86_FEATURE_AVX);
14*4882a593Smuzhiyun }
15*4882a593Smuzhiyun
/*
 * Recover two failed data blocks using P and Q, AVX2 version.
 *
 * @disks: number of entries in @ptrs; P is ptrs[disks-2], Q is ptrs[disks-1]
 * @bytes: number of bytes to process in each block
 * @faila: index in @ptrs of the first failed data block
 * @failb: index in @ptrs of the second failed data block
 * @ptrs:  block pointer table; temporarily modified, restored before the
 *         vector loop starts
 *
 * On return the page at ptrs[faila] and the page at ptrs[failb] hold the
 * computed results; P and Q are only read.
 *
 * Preconditions implied by the code below:
 *  - every block is accessed with vmovdqa (aligned move), so the buffers
 *    must be 32-byte aligned;
 *  - @bytes is a size_t counted down by 64 (32 without CONFIG_X86_64)
 *    until it reaches zero, so it must be a multiple of that stride or
 *    the loop wraps and runs past the buffers;
 *  - failb - faila is used as an array index into raid6_gfexi
 *    (presumably callers guarantee faila < failb -- confirm).
 *
 * The asm statements carry no register clobbers; they rely on the compiler
 * not touching the YMM registers between kernel_fpu_begin() and
 * kernel_fpu_end().
 */
static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
		int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */
	const u8 x0f = 0x0f;

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data pages.
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q.
	 */
	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dp;
	ptrs[failb] = dq;
	ptrs[disks-2] = p;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
		raid6_gfexp[failb]]];

	kernel_fpu_begin();

	/* ymm7 = x0f[32]: nibble mask 0x0f replicated into every byte */
	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));

	while (bytes) {
#ifdef CONFIG_X86_64
		/* Two 32-byte vectors per iteration (offsets 0 and 32). */
		asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
		asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
		asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
		asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));
		asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0]));
		asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32]));
		asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0]));
		asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32]));

		/*
		 * 1 = dq[0] ^ q[0]
		 * 9 = dq[32] ^ q[32]
		 * 0 = dp[0] ^ p[0]
		 * 8 = dp[32] ^ p[32]
		 */

		/*
		 * Table lookup through qmul: bytes 0..15 of the table are
		 * indexed by the low nibble, bytes 16..31 by the high
		 * nibble, and the two lookups are XORed together.
		 */
		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
		asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));

		asm volatile("vpsraw $4, %ymm1, %ymm3");
		asm volatile("vpsraw $4, %ymm9, %ymm12");
		asm volatile("vpand %ymm7, %ymm1, %ymm1");
		asm volatile("vpand %ymm7, %ymm9, %ymm9");
		asm volatile("vpand %ymm7, %ymm3, %ymm3");
		asm volatile("vpand %ymm7, %ymm12, %ymm12");
		asm volatile("vpshufb %ymm9, %ymm4, %ymm14");
		asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
		asm volatile("vpshufb %ymm12, %ymm5, %ymm15");
		asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
		asm volatile("vpxor %ymm14, %ymm15, %ymm15");
		asm volatile("vpxor %ymm4, %ymm5, %ymm5");

		/*
		 * 5 = qx[0]
		 * 15 = qx[32]
		 */

		/* Same nibble-split lookup, now through pbmul on px. */
		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
		asm volatile("vpsraw $4, %ymm0, %ymm2");
		asm volatile("vpsraw $4, %ymm8, %ymm6");
		asm volatile("vpand %ymm7, %ymm0, %ymm3");
		asm volatile("vpand %ymm7, %ymm8, %ymm14");
		asm volatile("vpand %ymm7, %ymm2, %ymm2");
		asm volatile("vpand %ymm7, %ymm6, %ymm6");
		asm volatile("vpshufb %ymm14, %ymm4, %ymm12");
		asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
		asm volatile("vpshufb %ymm6, %ymm1, %ymm13");
		asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
		asm volatile("vpxor %ymm4, %ymm1, %ymm1");
		asm volatile("vpxor %ymm12, %ymm13, %ymm13");

		/*
		 * 1 = pbmul[px[0]]
		 * 13 = pbmul[px[32]]
		 */
		asm volatile("vpxor %ymm5, %ymm1, %ymm1");
		asm volatile("vpxor %ymm15, %ymm13, %ymm13");

		/*
		 * 1 = db = DQ
		 * 13 = db[32] = DQ[32]
		 */
		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
		asm volatile("vmovdqa %%ymm13,%0" : "=m" (dq[32]));
		/* 0 = db ^ px, 8 = db[32] ^ px[32]: the values stored to dp */
		asm volatile("vpxor %ymm1, %ymm0, %ymm0");
		asm volatile("vpxor %ymm13, %ymm8, %ymm8");

		asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
		asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
#else
		/* One 32-byte vector per iteration; only ymm0-ymm7 are used. */
		asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q));
		asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p));
		asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq));
		asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp));

		/* 1 = dq ^ q; 0 = dp ^ p */

		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
		asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));

		/*
		 * 1 = dq ^ q
		 * 3 = (dq ^ q) >> 4
		 */
		asm volatile("vpsraw $4, %ymm1, %ymm3");
		asm volatile("vpand %ymm7, %ymm1, %ymm1");
		asm volatile("vpand %ymm7, %ymm3, %ymm3");
		asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
		asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
		asm volatile("vpxor %ymm4, %ymm5, %ymm5");

		/* 5 = qx */

		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));

		asm volatile("vpsraw $4, %ymm0, %ymm2");
		asm volatile("vpand %ymm7, %ymm0, %ymm3");
		asm volatile("vpand %ymm7, %ymm2, %ymm2");
		asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
		asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
		asm volatile("vpxor %ymm4, %ymm1, %ymm1");

		/* 1 = pbmul[px] */
		asm volatile("vpxor %ymm5, %ymm1, %ymm1");
		/* 1 = db = DQ */
		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));

		/* 0 = db ^ px: the value stored to dp */
		asm volatile("vpxor %ymm1, %ymm0, %ymm0");
		asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));

		bytes -= 32;
		p += 32;
		q += 32;
		dp += 32;
		dq += 32;
#endif
	}

	kernel_fpu_end();
}
185*4882a593Smuzhiyun
/*
 * Recover one failed data block together with P, using Q, AVX2 version.
 *
 * @disks: number of entries in @ptrs; P is ptrs[disks-2], Q is ptrs[disks-1]
 * @bytes: number of bytes to process in each block
 * @faila: index in @ptrs of the failed data block
 * @ptrs:  block pointer table; temporarily modified, restored before the
 *         vector loop starts
 *
 * On return the page at ptrs[faila] holds qmul[q ^ dq] and the P page has
 * been updated in place by XORing that same value into it; Q is only read.
 *
 * Preconditions implied by the code below:
 *  - every block is accessed with vmovdqa (aligned move), so the buffers
 *    must be 32-byte aligned;
 *  - @bytes is a size_t counted down by 64 (32 without CONFIG_X86_64)
 *    until it reaches zero, so it must be a multiple of that stride or
 *    the loop wraps and runs past the buffers.
 *
 * The asm statements carry no register clobbers; they rely on the compiler
 * not touching the YMM registers between kernel_fpu_begin() and
 * kernel_fpu_end().
 */
static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
		void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */
	const u8 x0f = 0x0f;

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data page.
	 * Use the dead data page as temporary storage for delta q.
	 */
	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dq;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];

	kernel_fpu_begin();

	/* ymm7 = x0f[32]: nibble mask 0x0f replicated into every byte */
	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));

	while (bytes) {
#ifdef CONFIG_X86_64
		/* Two 32-byte vectors per iteration (offsets 0 and 32). */
		asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
		asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32]));
		asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
		asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32]));

		/*
		 * 3 = q[0] ^ dq[0]
		 * 8 = q[32] ^ dq[32]
		 */

		/*
		 * Load both halves of the qmul table and keep a second copy
		 * (ymm13/ymm14) for the vector at offset 32, since vpshufb
		 * below overwrites the table register with the lookup result.
		 */
		asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
		asm volatile("vmovapd %ymm0, %ymm13");
		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
		asm volatile("vmovapd %ymm1, %ymm14");

		asm volatile("vpsraw $4, %ymm3, %ymm6");
		asm volatile("vpsraw $4, %ymm8, %ymm12");
		asm volatile("vpand %ymm7, %ymm3, %ymm3");
		asm volatile("vpand %ymm7, %ymm8, %ymm8");
		asm volatile("vpand %ymm7, %ymm6, %ymm6");
		asm volatile("vpand %ymm7, %ymm12, %ymm12");
		asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
		asm volatile("vpshufb %ymm8, %ymm13, %ymm13");
		asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
		asm volatile("vpshufb %ymm12, %ymm14, %ymm14");
		asm volatile("vpxor %ymm0, %ymm1, %ymm1");
		asm volatile("vpxor %ymm13, %ymm14, %ymm14");

		/*
		 * 1 = qmul[q[0] ^ dq[0]]
		 * 14 = qmul[q[32] ^ dq[32]]
		 */
		asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
		asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32]));
		asm volatile("vpxor %ymm1, %ymm2, %ymm2");
		asm volatile("vpxor %ymm14, %ymm12, %ymm12");

		/*
		 * 2 = p[0] ^ qmul[q[0] ^ dq[0]]
		 * 12 = p[32] ^ qmul[q[32] ^ dq[32]]
		 */

		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
		asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32]));
		asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0]));
		asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
#else
		/* One 32-byte vector per iteration; only ymm0-ymm7 are used. */
		asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
		asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));

		/* 3 = q ^ dq */

		asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));

		asm volatile("vpsraw $4, %ymm3, %ymm6");
		asm volatile("vpand %ymm7, %ymm3, %ymm3");
		asm volatile("vpand %ymm7, %ymm6, %ymm6");
		asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
		asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
		asm volatile("vpxor %ymm0, %ymm1, %ymm1");

		/* 1 = qmul[q ^ dq] */

		asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
		asm volatile("vpxor %ymm1, %ymm2, %ymm2");

		/* 2 = p ^ qmul[q ^ dq] */

		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
		asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0]));

		bytes -= 32;
		p += 32;
		q += 32;
		dq += 32;
#endif
	}

	kernel_fpu_end();
}
302*4882a593Smuzhiyun
303*4882a593Smuzhiyun const struct raid6_recov_calls raid6_recov_avx2 = {
304*4882a593Smuzhiyun .data2 = raid6_2data_recov_avx2,
305*4882a593Smuzhiyun .datap = raid6_datap_recov_avx2,
306*4882a593Smuzhiyun .valid = raid6_has_avx2,
307*4882a593Smuzhiyun #ifdef CONFIG_X86_64
308*4882a593Smuzhiyun .name = "avx2x2",
309*4882a593Smuzhiyun #else
310*4882a593Smuzhiyun .name = "avx2x1",
311*4882a593Smuzhiyun #endif
312*4882a593Smuzhiyun .priority = 2,
313*4882a593Smuzhiyun };
314