// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 *
 */

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

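/*
 * The RAID-6 Q syndrome is computed over GF(2^8) with the generator
 * polynomial 0x11d; 0x1d is the reduction byte XORed in whenever a
 * multiply-by-2 overflows.  The constant is replicated to fill a full
 * 512-bit register.
 */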
static const struct raid6_avx512_constants {
	u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

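/*
 * The byte-granular mask operations used below (vpcmpgtb into an opmask,
 * vpmovm2b) are AVX512BW instructions, so BW is required on top of the
 * AVX512F foundation; VL and DQ are checked as well before this
 * implementation is selected.
 */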
static int raid6_have_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"	/* P[0] */
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"	/* Q[0] */
			     "vmovdqa64 %1,%%zmm6"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
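		/*
		 * Walk down the remaining data disks.  Each step multiplies
		 * the running Q by 2 in GF(2^8): the signed byte compare
		 * against zero (vpcmpgtb) picks out bytes with the high bit
		 * set, vpmovm2b expands that mask to 0xff bytes, vpaddb
		 * doubles every byte, and ANDing the mask with 0x1d before
		 * XORing performs the polynomial reduction.  The disk data
		 * loaded in the previous round (zmm6) is then XORed into
		 * both P and Q.
		 */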
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm6"
				     :
				     : "m" (dptr[z][d]));
		}
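		/*
		 * One final multiply-by-2 and XOR folds in the data loaded
		 * in the last iteration, then P and Q are streamed out with
		 * non-temporal stores and the registers cleared.
		 */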
		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm4,%1\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
			     :
			     : "m" (p[d]), "m" (q[d]));
	}

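	/* Order the non-temporal stores before leaving FPU context */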
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

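/*
 * Recompute the P/Q contribution of data disks start..stop and fold it
 * into the existing parity blocks (partial stripe update).
 */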
static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm2\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
			     :
			     : "m" (dptr[z0][d]), "m" (p[d]));
		/* P/Q data pages */
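		/*
		 * For each changed disk, multiply the accumulated Q by 2 in
		 * GF(2^8) and XOR the disk's data into both P and Q.
		 */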
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : "m" (dptr[z][d]));
		}
		/* P/Q left side optimization */
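		/*
		 * Disks below 'start' did not change, so no data needs to be
		 * XORed in; only the multiply-by-2 is applied so that the Q
		 * delta ends up at the right power of the generator.
		 */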
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     /* Don't use movntdq for r/w memory area < cache line */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm2,%1"
			     :
			     : "m" (q[d]), "m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x1 = {
	raid6_avx5121_gen_syndrome,
	raid6_avx5121_xor_syndrome,
	raid6_have_avx512,
	"avx512x1",
	1			/* Has cache hints */
};

/*
 * Unrolled-by-2 AVX512 implementation
 */
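/*
 * Two independent 64-byte lanes are processed per loop iteration:
 * zmm2/zmm4 hold P/Q for bytes d..d+63, zmm3/zmm6 for bytes d+64..d+127.
 */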
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	/* We uniformly assume a single prefetch covers at least 64 bytes */
	for (d = 0; d < bytes; d += 128) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"	/* P[0] */
			     "vmovdqa64 %1,%%zmm3\n\t"	/* P[1] */
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"	/* Q[0] */
			     "vmovdqa64 %%zmm3,%%zmm6"	/* Q[1] */
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm4,%2\n\t"
			     "vmovntdq %%zmm6,%3"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
			       "m" (q[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm2\n\t"
			     "vmovdqa64 %3,%%zmm3\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (p[d]), "m" (p[d+64]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
			     /* Don't use movntdq for r/w
			      * memory area < cache line
			      */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm6,%1\n\t"
			     "vmovdqa64 %%zmm2,%2\n\t"
			     "vmovdqa64 %%zmm3,%3"
			     :
			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
			       "m" (p[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
	raid6_avx5122_gen_syndrome,
	raid6_avx5122_xor_syndrome,
	raid6_have_avx512,
	"avx512x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation
 */
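/*
 * Four 64-byte lanes per loop iteration.  Unlike the x1/x2 variants, the
 * P/Q accumulators are cleared up front (and again after every store)
 * rather than being seeded from the highest data disk, so the inner loop
 * runs over all data disks.
 */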
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"	/* Zero temp */
		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"	/* P[0] */
		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"	/* P[1] */
		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"	/* Q[0] */
		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"	/* Q[1] */
		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"	/* P[2] */
		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"	/* P[3] */
		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"	/* Q[2] */
		     "vpxorq %%zmm14,%%zmm14,%%zmm14"	/* Q[3] */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 256) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "prefetchnta %2\n\t"
				     "prefetchnta %3\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vmovntdq %%zmm14,%7\n\t"
			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     :: "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 256) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm12\n\t"
			     "vmovdqa64 %3,%%zmm14\n\t"
			     "vmovdqa64 %4,%%zmm2\n\t"
			     "vmovdqa64 %5,%%zmm3\n\t"
			     "vmovdqa64 %6,%%zmm10\n\t"
			     "vmovdqa64 %7,%%zmm11\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "prefetchnta %0\n\t"
				     "prefetchnta %2\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]),
				       "m" (dptr[z][d+192]));
		}
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     :
			     : "m" (q[d]), "m" (q[d+128]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : );
		}
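		/*
		 * Stream the updated P lanes out, then fold the old Q into
		 * the accumulated values and stream the Q lanes.
		 */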
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vmovntdq %%zmm14,%7"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
	raid6_avx5124_gen_syndrome,
	raid6_avx5124_xor_syndrome,
	raid6_have_avx512,
	"avx512x4",
	1			/* Has cache hints */
};
#endif

#endif /* CONFIG_AS_AVX512 */