// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 */

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};
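
/*
 * P is the plain XOR of all data blocks; Q is the GF(2^8) weighted sum
 * Q = sum(g^z * D_z), built Horner-style by repeated multiplication by
 * g = 2.  The loops below do that doubling on 32 bytes at once:
 * vpcmpgtb against an all-zero register turns each byte's top bit into
 * a 0x00/0xff mask, vpaddb doubles every byte, vpand reduces the mask
 * to the 0x1d constant above, and vpxor folds the reduction back in.
 * A byte-at-a-time sketch of that same step, for illustration only
 * (this helper name is ours, not part of the raid6 API):
 */
static inline u8 raid6_gf_mul2_sketch(u8 v)
{
	/* If the top bit is set, doubling overflows GF(2^8) and we reduce
	 * by the RAID-6 polynomial x^8+x^4+x^3+x^2+1, i.e. XOR with 0x1d. */
	return (u8)(v << 1) ^ ((v & 0x80) ? 0x1d : 0);
}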

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");	/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
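
/*
 * Callers hand in a single ptrs[] array laid out as the comments above
 * describe: data disks first, then P, then Q, each block 'bytes' long
 * and 32-byte aligned, with 'bytes' a multiple of 32.  A minimal sketch
 * of a call for a 4-disk set (two data blocks plus P and Q); this
 * wrapper is illustrative only and not part of the raid6 API:
 */
static inline void raid6_avx21_gen_example(u8 *d0, u8 *d1, u8 *p, u8 *q,
					   size_t bytes)
{
	/* disks == 4: data at [0..1], P at [disks-2], Q at [disks-1] */
	void *ptrs[4] = { d0, d1, p, q };

	raid6_avx21_gen_syndrome(4, bytes, ptrs);
}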

static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
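		/*
		 * Disks below 'start' are outside the updated range, so
		 * they add nothing to P; the partial Q in %ymm4 was built
		 * Horner-style from 'stop' downward, though, so it still
		 * needs one GF(2^8) doubling per remaining lower disk to
		 * scale every term to its proper power of the generator.
		 */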
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	1			/* Has cache hints */
};
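
/*
 * Each raid6_calls table in this file is benchmarked by
 * raid6_select_algo() at init time, and the fastest implementation
 * whose ->valid() check passes is selected.  For reference, the same
 * table with designated initializers (field names per struct
 * raid6_calls in <linux/raid/pq.h>; shown as a comment so the symbol
 * is not redefined):
 *
 *	const struct raid6_calls raid6_avx2x1 = {
 *		.gen_syndrome	= raid6_avx21_gen_syndrome,
 *		.xor_syndrome	= raid6_avx21_xor_syndrome,
 *		.valid		= raid6_have_avx2,
 *		.name		= "avx2x1",
 *		.prefer		= 1,
 *	};
 */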

/*
 * Unrolled-by-2 AVX2 implementation
 */
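/*
 * Processing two 32-byte lanes per iteration gives the CPU independent
 * dependency chains to overlap, at the cost of more registers in flight.
 */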
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4");	/* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6");	/* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
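/*
 * This variant uses %ymm8-%ymm15, which exist only in 64-bit mode;
 * hence the CONFIG_X86_64 guard.
 */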
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	1			/* Has cache hints */
};
#endif