xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/sha1_ssse3_asm.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a circular buffer */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
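/*
 * For example, WK(t) only looks at the low four bits of the round number, so
 * WK(1), WK(17), WK(33) and WK(49) all refer to the same 4-byte stack slot,
 * 4(%rsp). A value pre-calculated W_PRECALC_AHEAD (16) rounds early therefore
 * lands in exactly the slot that the matching round will later read.
 */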

/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: the function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for each 64-byte block of the input
 */
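/*
 * For reference, a plain scalar C sketch of the compression these macros
 * implement (illustrative only, not part of the build; rol32() and
 * get_unaligned_be32() are used here just for brevity):
 *
 *	static void sha1_block_ref(u32 state[5], const u8 *data)
 *	{
 *		u32 w[80], a, b, c, d, e, f, k, tmp;
 *		int t;
 *
 *		for (t = 0; t < 16; t++)
 *			w[t] = get_unaligned_be32(data + 4 * t);
 *		for (t = 16; t < 80; t++)
 *			w[t] = rol32(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1);
 *
 *		a = state[0]; b = state[1]; c = state[2];
 *		d = state[3]; e = state[4];
 *
 *		for (t = 0; t < 80; t++) {
 *			if (t < 20) {
 *				f = (b & c) | (~b & d);          k = 0x5a827999;
 *			} else if (t < 40) {
 *				f = b ^ c ^ d;                   k = 0x6ed9eba1;
 *			} else if (t < 60) {
 *				f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc;
 *			} else {
 *				f = b ^ c ^ d;                   k = 0xca62c1d6;
 *			}
 *			tmp = rol32(a, 5) + f + e + k + w[t];
 *			e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
 *		}
 *
 *		state[0] += a; state[1] += b; state[2] += c;
 *		state[3] += d; state[4] += e;
 *	}
 */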
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current block is the last one,
	cmovae	K_BASE, BUFFER_PTR	# use a dummy source to avoid a buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm
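/*
 * SWAP_REG_NAMES only swaps the assembler-time symbols, never any register
 * contents. For example, with the allocation above, after
 * "SWAP_REG_NAMES C, T1" the name C refers to %eax and T1 to %edi. The
 * F1/F2/F3 helpers below copy one of the working variables into T1 and then
 * swap the names, so the fresh copy carries that variable forward while its
 * old register is clobbered to compute the round function.
 */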

.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4  b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# ((a <<r 5) >>r 7) => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - the last 32 w[i] values are kept in 8 XMM registers
 * - the K+w[i] values are pre-calculated and stored to memory, to be loaded
 *   later by the scalar ALU add instruction
 *
 * rounds 16-31 need some "heavy-lifting" vectorization because of the
 * w[i]->w[i-3] dependency, but it pays off for rounds 32-79
 */
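/*
 * In scalar C terms, four consecutive invocations of this macro (one per
 * (i & 3) case) produce one aligned group of four schedule words and their
 * w+K sums, roughly (sketch only; K(j) stands for the round constant
 * selected via K_XMM):
 *
 *	for (j = i; j < i + 4; j++) {
 *		w[j] = rol32(w[j-3] ^ w[j-8] ^ w[j-14] ^ w[j-16], 1);
 *		wk[j & 15] = w[j] + K(j);
 *	}
 */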
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * here we use the equivalent:  w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i]=>w[i-3] dependency is broken
 */
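/*
 * Written as scalar C, the equivalent recurrence noted above (used from
 * round 32 on) would read:
 *
 *	w[i] = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 *
 * With the w[i-3] term gone, all four lanes of an XMM register depend only
 * on values that are already final, so a full 4-wide vector step needs no
 * intra-register fix-up.
 */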
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
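/*
 * Illustrative caller sketch (an assumption about typical use, not the
 * actual glue code): the transform clobbers XMM state, so a C caller would
 * wrap it in kernel_fpu_begin()/kernel_fpu_end(), e.g.
 *
 *	struct sha1_state st = {
 *		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
 *	};
 *
 *	kernel_fpu_begin();
 *	sha1_transform_ssse3(&st, data, nblocks);  // nblocks 64-byte blocks
 *	kernel_fpu_end();
 */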
SHA1_VECTOR_ASM     sha1_transform_ssse3

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx