xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/blake2s-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun#include <linux/linkage.h>
8*4882a593Smuzhiyun
/*
 * Read-only constant tables.  Each one is placed in its own mergeable
 * ("aM") .rodata section whose entry size equals the size of the table.
 */

/*
 * IV: eight 32-bit initialization words.  The first four are loaded as
 * IV(%rip), the last four as IV+16(%rip).
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
	.long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19

/*
 * ROT16: pshufb control mask.  Destination byte i takes source byte
 * mask[i], which rotates every 32-bit lane by 16 bits.
 */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.byte  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13

/*
 * ROR328: pshufb control mask that rotates every 32-bit lane right by
 * 8 bits (destination byte 0 <- source byte 1, ..., byte 3 <- byte 0).
 */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.byte  1,  2,  3,  0,  5,  6,  7,  4,  9, 10, 11,  8, 13, 14, 15, 12

/*
 * SIGMA: message schedule for blake2s_compress_ssse3.  Ten rows (one per
 * round) of sixteen byte-sized indices into the 16-word message block.
 * The indices within a row are stored in the order the SSSE3 code
 * consumes them: four consecutive bytes per gather.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12

#ifdef CONFIG_AS_AVX512
/*
 * SIGMA2: message schedule for blake2s_compress_avx512.  Ten rows of
 * sixteen 32-bit indices, used as vpermi2d selectors.  The AVX-512 round
 * loop overwrites its message registers with the permuted words after
 * every round, so each row is applied to the output of the previous
 * round's permutation rather than to the original message order.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */
47*4882a593Smuzhiyun
.text

/*
 * BLAKE2S_SSSE3_GATHER off, m0, m1, m2, m3
 *
 * Read the four schedule bytes at off+0 .. off+3 from the current SIGMA
 * row (%rcx), use each as a word index into the message block (%rsi) and
 * pack the four selected 32-bit words, in that order, into lanes 0..3 of
 * m0.  m1, m2 and m3 are scratch registers.  Clobbers %rax.
 */
.macro BLAKE2S_SSSE3_GATHER off, m0, m1, m2, m3
	movzbl		\off+0(%rcx),%eax
	movd		(%rsi,%rax,4),\m0
	movzbl		\off+1(%rcx),%eax
	movd		(%rsi,%rax,4),\m1
	movzbl		\off+2(%rcx),%eax
	movd		(%rsi,%rax,4),\m2
	movzbl		\off+3(%rcx),%eax
	movd		(%rsi,%rax,4),\m3
	punpckldq	\m1,\m0
	punpckldq	\m3,\m2
	punpcklqdq	\m2,\m0
.endm

/*
 * BLAKE2S_SSSE3_MIX msg, rotmask, bits
 *
 * One half of the mixing step, applied to four 32-bit lanes at once with
 * a = %xmm0, b = %xmm1, c = %xmm2, d = %xmm3:
 *
 *	a += msg + b
 *	d  = pshufb(d ^ a, rotmask)	(byte-granular lane rotation)
 *	c += d
 *	b  = ror32(b ^ c, bits)
 *
 * Clobbers %xmm8.
 */
.macro BLAKE2S_SSSE3_MIX msg, rotmask, bits
	paddd		\msg,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		\rotmask,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$\bits,%xmm1
	pslld		$(32-\bits),%xmm8
	por		%xmm8,%xmm1
.endm

/*
 * blake2s_compress_ssse3 - BLAKE2s compression using SSSE3
 *
 * In:	%rdi = state: 32 bytes of chaining value at offset 0, followed by
 *	       16 bytes at offset 0x20 whose low 64 bits are advanced by
 *	       %rcx for every block (presumably the h/t/f members of
 *	       struct blake2s_state - confirm against the C declaration)
 *	%rsi = input, consumed in 64-byte blocks
 *	%rdx = number of blocks; 0 returns immediately with the state
 *	       untouched
 *	%rcx = amount added to the counter per block
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm8, %xmm10-%xmm15, flags.
 *
 * Register roles inside the loops:
 *	%xmm0/%xmm1	working rows a/b, seeded from the chaining value
 *	%xmm2/%xmm3	working rows c/d, seeded from IV and counter ^ IV
 *	%xmm4-%xmm7	gathered message words (rotating assignment)
 *	%xmm10/%xmm11	chaining value saved at the start of the block
 *	%xmm12/%xmm13	ROT16 / ROR328 shuffle masks
 *	%xmm14		the 16 counter/flag bytes from 0x20(%rdi)
 *	%xmm15		per-block increment (low qword), zero high qword
 *	%rcx		current SIGMA row, %r8 = end of SIGMA
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	jz		.Lblake2s_compress_ssse3_done
	movdqu		(%rdi),%xmm0
	movdqu		16(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		32(%rdi),%xmm14
	movq		%rcx,%xmm15
	leaq		SIGMA+160(%rip),%r8
	jmp		.Lblake2s_compress_ssse3_block
	.align		32
.Lblake2s_compress_ssse3_block:
	/* Remember the incoming chaining value for the final feed-forward. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	/* Advance the counter, then seed rows c and d. */
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+16(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
.Lblake2s_compress_ssse3_round:
	/* First half of the round: schedule bytes 0..7. */
	BLAKE2S_SSSE3_GATHER	0, %xmm4, %xmm5, %xmm6, %xmm7
	BLAKE2S_SSSE3_MIX	%xmm4, %xmm12, 12
	BLAKE2S_SSSE3_GATHER	4, %xmm5, %xmm6, %xmm7, %xmm4
	BLAKE2S_SSSE3_MIX	%xmm5, %xmm13, 7
	/* Rotate the lanes of rows a, d and c so the second half mixes a
	 * different combination of lanes. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Second half of the round: schedule bytes 8..15. */
	BLAKE2S_SSSE3_GATHER	8, %xmm6, %xmm7, %xmm4, %xmm5
	BLAKE2S_SSSE3_MIX	%xmm6, %xmm12, 12
	BLAKE2S_SSSE3_GATHER	12, %xmm7, %xmm4, %xmm5, %xmm6
	BLAKE2S_SSSE3_MIX	%xmm7, %xmm13, 7
	/* Undo the lane rotation. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	/* Next 16-byte SIGMA row; stop after the last one. */
	addq		$16,%rcx
	cmpq		%r8,%rcx
	jne		.Lblake2s_compress_ssse3_round
	/* Fold rows c/d and the saved chaining value into the new one. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$64,%rsi
	decq		%rdx
	jne		.Lblake2s_compress_ssse3_block
	/* Write back the chaining value and the updated counter bytes. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,16(%rdi)
	movdqu		%xmm14,32(%rdi)
.Lblake2s_compress_ssse3_done:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
176*4882a593Smuzhiyun
#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - BLAKE2s compression using EVEX-encoded
 * vprord/vpermi2d on xmm/ymm registers
 *
 * In:	%rdi = state: 32 bytes of chaining value at offset 0, followed by
 *	       16 bytes at offset 0x20 whose low 64 bits are advanced by
 *	       %rcx for every block (presumably the h/t/f members of
 *	       struct blake2s_state - confirm against the C declaration)
 *	%rsi = input, consumed in 64-byte blocks
 *	%rdx = number of blocks; 0 returns immediately with the state
 *	       untouched
 *	%rcx = amount added to the counter per block
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %xmm0-%xmm11 (including the upper
 * halves of %ymm6-%ymm9), %xmm14, %xmm15, flags.
 *
 * Register roles inside the loops:
 *	%xmm0/%xmm1	working rows a/b, seeded from the chaining value
 *	%xmm2/%xmm3	working rows c/d, seeded from IV and counter ^ IV
 *	%xmm4		the 16 counter/flag bytes from 0x20(%rdi)
 *	%xmm5		per-block increment (low qword), zero high qword
 *	%ymm6/%ymm7	the 16 message words, re-permuted in place each round
 *	%ymm8/%ymm9	SIGMA2 selectors, then the words they select
 *	%xmm10/%xmm11	chaining value saved at the start of the block
 *	%xmm14/%xmm15	IV low / IV high
 *	%rax		current SIGMA2 row, %cl = rounds remaining
 */
SYM_FUNC_START(blake2s_compress_avx512)
	/*
	 * Nothing to do for a zero block count.  blake2s_compress_ssse3 has
	 * the same guard; without it the decq at the bottom of the main
	 * loop would wrap around and the loop would keep reading past the
	 * input.  No ymm register has been touched yet, so the early exit
	 * can skip vzeroupper.
	 */
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_done
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Remember the incoming chaining value for the final feed-forward. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	/* Advance the counter, then seed rows c and d. */
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	/* Load the whole 64-byte block into ymm6:ymm7. */
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* ten rounds */
.Lblake2s_compress_avx512_roundloop:
	/* Fetch this round's 16 selectors and step to the next row. */
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	/* Select words from the ymm6:ymm7 pair; results replace the selectors. */
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	/* Keep the permuted order: the next SIGMA2 row is relative to it. */
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	/* First half, words 0..3: a += m + b; d = ror(d^a,16); c += d; b = ror(b^c,12) */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* First half, words 4..7: same shape with rotations 8 and 7. */
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Rotate the lanes of rows a, d and c so the second half mixes a
	 * different combination of lanes. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* Second half, words 8..11: rotations 16 and 12. */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Second half, words 12..15: rotations 8 and 7. */
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undo the lane rotation. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Fold the saved chaining value and rows c/d into the new one. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	/* Write back the chaining value and the updated counter bytes. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	/* ymm6-ymm9 were written above; clear the upper halves before returning. */
	vzeroupper
.Lblake2s_compress_avx512_done:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
257