xref: /OK3568_Linux_fs/kernel/lib/raid6/sse2.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */
14*4882a593Smuzhiyun 
15*4882a593Smuzhiyun #include <linux/raid/pq.h>
16*4882a593Smuzhiyun #include "x86.h"
17*4882a593Smuzhiyun 
/*
 * Sixteen copies of 0x1d, the low byte of the RAID-6 generator
 * polynomial 0x11d.  Loaded into an XMM register and used as the
 * reduction mask when doubling a byte vector in GF(2^8).
 */
static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants  __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};
23*4882a593Smuzhiyun 
raid6_have_sse2(void)24*4882a593Smuzhiyun static int raid6_have_sse2(void)
25*4882a593Smuzhiyun {
26*4882a593Smuzhiyun 	/* Not really boot_cpu but "all_cpus" */
27*4882a593Smuzhiyun 	return boot_cpu_has(X86_FEATURE_MMX) &&
28*4882a593Smuzhiyun 		boot_cpu_has(X86_FEATURE_FXSR) &&
29*4882a593Smuzhiyun 		boot_cpu_has(X86_FEATURE_XMM) &&
30*4882a593Smuzhiyun 		boot_cpu_has(X86_FEATURE_XMM2);
31*4882a593Smuzhiyun }
32*4882a593Smuzhiyun 
33*4882a593Smuzhiyun /*
34*4882a593Smuzhiyun  * Plain SSE2 implementation
35*4882a593Smuzhiyun  */
/*
 * Compute the RAID-6 P (plain XOR parity) and Q (GF(2^8) Reed-Solomon
 * syndrome) pages for a whole stripe, one 16-byte SSE word at a time.
 *
 * @disks: total number of disks (data disks + P + Q)
 * @bytes: length of each page; consumed in 16-byte movdqa chunks, so it
 *         is assumed to be a multiple of 16 with 16-byte-aligned pages
 * @ptrs:  ptrs[0..disks-3] data pages, ptrs[disks-2] = P, ptrs[disks-1] = Q
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	/* xmm0 = 16 x 0x1d, the GF(2^8) reduction mask */
	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			/*
			 * Multiply the running Q (xmm4) by 2 in GF(2^8):
			 * xmm5 = 0xff for every byte with the top bit set,
			 * double byte-wise, then XOR in the 0x1d reduction
			 * for the bytes that overflowed.
			 */
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");	/* re-zero temp */
			asm volatile("pxor %xmm6,%xmm2");	/* P ^= data */
			asm volatile("pxor %xmm6,%xmm4");	/* Q ^= data */
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		/* Final round: fold in disk 0's data already held in xmm6 */
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		/* Non-temporal stores: P/Q output bypasses the cache */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

	/* Order the movntdq stores before P/Q are consumed */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
85*4882a593Smuzhiyun 
86*4882a593Smuzhiyun 
/*
 * Fold the contribution of the data disks in the inclusive range
 * [start, stop] into the existing P and Q pages, 16 bytes at a time.
 * For disk positions below @start no data is touched; only the GF(2^8)
 * x2 scaling of the Q accumulator is carried on for them.
 *
 * @disks: total number of disks (data disks + P + Q)
 * @start, @stop: range of data-disk indices whose pages are folded in
 * @bytes: page length; processed in aligned 16-byte chunks
 * @ptrs:  data pages, P at ptrs[disks-2], Q at ptrs[disks-1]
 */
static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	/* xmm0 = 16 x 0x1d, the GF(2^8) reduction mask */
	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 16 ) {
		/* Seed accumulators from the highest affected disk */
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("pxor %xmm4,%xmm2");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			/* Q *= 2 in GF(2^8), then fold in dptr[z] */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("pxor %xmm5,%xmm2");	/* P ^= data */
			asm volatile("pxor %xmm5,%xmm4");	/* Q ^= data */
		}
		/* P/Q left side optimization: scale Q only, no data reads */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
134*4882a593Smuzhiyun 
/* Exported descriptor for the one-wide SSE2 implementation. */
const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_sse21_xor_syndrome,
	raid6_have_sse2,	/* runtime availability check */
	"sse2x1",
	1			/* Has cache hints */
};
142*4882a593Smuzhiyun 
143*4882a593Smuzhiyun /*
144*4882a593Smuzhiyun  * Unrolled-by-2 SSE2 implementation
145*4882a593Smuzhiyun  */
/*
 * Unrolled-by-2 P/Q generation: two independent 16-byte lanes
 * (P in xmm2/xmm3, Q in xmm4/xmm6) so 32 bytes are produced per loop
 * iteration.  Same GF(2^8) math as the one-wide version; assumes
 * @bytes is a multiple of 32 with 16-byte-aligned pages.
 *
 * @disks: total number of disks (data disks + P + Q)
 * @bytes: page length
 * @ptrs:  ptrs[0..disks-3] data pages, ptrs[disks-2] = P, ptrs[disks-1] = Q
 */
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	/* xmm0 = 16 x 0x1d, the GF(2^8) reduction mask */
	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			/* Q *= 2 in GF(2^8) in both lanes (0x1d mask) */
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			/* Fold the data into P and Q, re-zero the temps */
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		/* Non-temporal stores of both P and Q lanes */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	/* Order the movntdq stores before P/Q are consumed */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
197*4882a593Smuzhiyun 
/*
 * Unrolled-by-2 variant of the P/Q update: folds the data disks in
 * [start, stop] into the existing P and Q pages, 32 bytes (two 16-byte
 * lanes) per iteration.  Disks below @start contribute no data; the Q
 * accumulators are only scaled by 2 in GF(2^8) for them.
 *
 * @disks: total number of disks (data disks + P + Q)
 * @start, @stop: range of data-disk indices whose pages are folded in
 * @bytes: page length; assumed a multiple of 32, pages 16-byte aligned
 * @ptrs:  data pages, P at ptrs[disks-2], Q at ptrs[disks-1]
 */
static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	/* xmm0 = 16 x 0x1d, the GF(2^8) reduction mask */
	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 32 ) {
		/* Seed Q lanes from the highest affected disk, P from memory */
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			/* Q *= 2 in GF(2^8) in both lanes, then fold data in */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		/* P/Q left side optimization: scale Q only, no data reads */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
264*4882a593Smuzhiyun 
/* Exported descriptor for the unrolled-by-2 SSE2 implementation. */
const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,
	raid6_sse22_xor_syndrome,
	raid6_have_sse2,	/* runtime availability check */
	"sse2x2",
	1			/* Has cache hints */
};
272*4882a593Smuzhiyun 
273*4882a593Smuzhiyun #ifdef CONFIG_X86_64
274*4882a593Smuzhiyun 
275*4882a593Smuzhiyun /*
276*4882a593Smuzhiyun  * Unrolled-by-4 SSE2 implementation
277*4882a593Smuzhiyun  */
/*
 * Unrolled-by-4 P/Q generation (x86-64 only: needs xmm8-xmm15), four
 * independent 16-byte lanes so 64 bytes are produced per iteration.
 * Unlike the narrower variants, the P/Q accumulators start at zero and
 * every data disk (z0..0) is folded in by the single inner loop.
 *
 * @disks: total number of disks (data disks + P + Q)
 * @bytes: page length; assumed a multiple of 64, pages 16-byte aligned
 * @ptrs:  ptrs[0..disks-3] data pages, ptrs[disks-2] = P, ptrs[disks-1] = Q
 */
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	/* xmm0 = 16 x 0x1d, the GF(2^8) reduction mask */
	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4");	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6");	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7");	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12");	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14");	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 64 ) {
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			/* Q *= 2 in GF(2^8) in all four lanes (0x1d mask) */
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			/* Fold the data into P and Q, then re-zero the temps */
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		/* Non-temporal stores; accumulators re-zeroed for next chunk */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	/* Order the movntdq stores before P/Q are consumed */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
363*4882a593Smuzhiyun 
/*
 * Unrolled-by-4 variant of the P/Q update (x86-64 only: needs
 * xmm8-xmm15): folds the data disks in [start, stop] into the existing
 * P and Q pages, 64 bytes (four 16-byte lanes) per iteration.  Disks
 * below @start contribute no data; the Q accumulators are only scaled
 * by 2 in GF(2^8) for them.
 *
 * @disks: total number of disks (data disks + P + Q)
 * @start, @stop: range of data-disk indices whose pages are folded in
 * @bytes: page length; assumed a multiple of 64, pages 16-byte aligned
 * @ptrs:  data pages, P at ptrs[disks-2], Q at ptrs[disks-1]
 */
static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	/* xmm0 = 16 x 0x1d, the GF(2^8) reduction mask */
	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 64 ) {
		/* Seed Q lanes from the highest affected disk, P from memory */
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
		asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		asm volatile("pxor %xmm12,%xmm10");
		asm volatile("pxor %xmm14,%xmm11");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			/* Q *= 2 in GF(2^8) in all four lanes, then fold data */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+32]));
		/* P/Q left side optimization: scale Q only, no data reads */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		/* Write back P, then XOR the old Q in and write back Q */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
	}
	/* Order the movntdq stores before P/Q are consumed */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
470*4882a593Smuzhiyun 
471*4882a593Smuzhiyun 
/* Exported descriptor for the unrolled-by-4 SSE2 implementation. */
const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_sse24_xor_syndrome,
	raid6_have_sse2,	/* runtime availability check */
	"sse2x4",
	1			/* Has cache hints */
};
479*4882a593Smuzhiyun 
480*4882a593Smuzhiyun #endif /* CONFIG_X86_64 */
481