/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

/*
 * BLAKE2s compression function, SSSE3 and AVX-512 variants.
 *
 * Both entry points use the same register interface:
 *   %rdi  pointer to the state; 0x30 bytes at offsets 0x00-0x2f are read
 *         and written back (two 16-byte halves of the chaining value,
 *         followed by a 16-byte counter/flag vector)
 *   %rsi  pointer to the input, consumed in 0x40-byte blocks
 *   %rdx  number of 0x40-byte blocks to process
 *   %rcx  amount added to the low 64 bits of the counter vector before
 *         each block is compressed
 *
 * NOTE(review): all 64 bits of %rcx are moved into the vector register, so
 * this presumably relies on the caller passing a zero-extended increment -
 * confirm against the C prototype.
 */

#include <linux/linkage.h>

/* The eight 32-bit BLAKE2s initialization words, as two 128-bit rows. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb mask: rotate every 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
/* pshufb mask: rotate every 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * Message word indices for the SSSE3 code: ten rows (one per round) of
 * sixteen byte-sized indices, stored in the order the round loop loads
 * them (four groups of four words per round).
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/*
 * vpermi2d index vectors for the AVX-512 code: ten rows (one per round) of
 * sixteen 32-bit indices. Each round permutes the message registers left
 * behind by the previous round, so every row is relative to the word order
 * produced by the row before it.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
/*
 * blake2s_compress_ssse3 - compress %rdx blocks using SSE2/SSSE3 only.
 *
 * Working registers:
 *   xmm0, xmm1    chaining value (rows a and b of the working matrix)
 *   xmm2, xmm3    rows c and d of the working matrix
 *   xmm4-xmm7     gathered message words
 *   xmm8          scratch for the shift-based rotates
 *   xmm10, xmm11  copy of the chaining value on entry to the block
 *   xmm12, xmm13  ROT16 / ROR328 shuffle masks
 *   xmm14         counter/flag vector, xmm15 the per-block increment
 *   rcx           cursor into SIGMA (the increment has already been saved
 *                 in xmm15), r8 the end of SIGMA
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	/* Nothing to do, and nothing to store back, for zero blocks. */
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15
	/* 0xa0 = 10 rounds * 16 index bytes: one past the last SIGMA row. */
	leaq		SIGMA+0xa0(%rip),%r8
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	/* Per-block setup: save h, bump the counter, build rows c and d. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
.Lroundloop:
	/* Gather message words for SIGMA indices 0-3 into xmm4. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather message words for SIGMA indices 4-7 into xmm5. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Rotate the lanes of rows a, d, c so the next half mixes diagonals. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Gather message words for SIGMA indices 8-11 into xmm6. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather message words for SIGMA indices 12-15 into xmm7. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undo the lane rotation applied after the first half of the round. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	/* Advance to the next 16-byte SIGMA row until the end is reached. */
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: h = h_saved ^ (a,b) ^ (c,d). */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	/* Write back the chaining value and the updated counter vector. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - compress %rdx blocks using AVX-512 instructions
 * (vpermi2d and vprord on xmm/ymm registers).
 *
 * Working registers:
 *   xmm0, xmm1    chaining value (rows a and b of the working matrix)
 *   xmm2, xmm3    rows c and d of the working matrix
 *   xmm4          counter/flag vector, xmm5 the per-block increment
 *   ymm6, ymm7    the sixteen message words in their current permutation
 *   ymm8, ymm9    SIGMA2 indices, then the permuted words for this round
 *   xmm10, xmm11  copy of the chaining value on entry to the block
 *   xmm14, xmm15  the two IV rows
 *   rax           cursor into SIGMA2, cl the remaining round count
 */
SYM_FUNC_START(blake2s_compress_avx512)
	/*
	 * Nothing to do for zero blocks. Without this check the decq at the
	 * bottom of the main loop would wrap and the loop would keep reading
	 * past the input. This matches the check in blake2s_compress_ssse3.
	 */
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_end
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Per-block setup: save h, bump the counter, build rows c and d. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	/* Load the whole 64-byte block once; rounds only permute it. */
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	/* cl counts the 10 rounds; the increment is already in xmm5. */
	movb		$0xa,%cl
.Lblake2s_compress_avx512_roundloop:
	/* Fetch this round's 64-byte index row and permute the message. */
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	/* Keep the permuted words as the source for the next round. */
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Upper half of ymm8 holds the next four message words. */
	vextracti128	$0x1,%ymm8,%xmm8
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Rotate the lanes of rows a, d, c so the next half mixes diagonals. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Upper half of ymm9 holds the last four message words. */
	vextracti128	$0x1,%ymm9,%xmm9
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undo the lane rotation applied after the first half of the round. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: h = h_saved ^ (a,b) ^ (c,d). */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	/* Write back the chaining value and the updated counter vector. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	/* ymm registers were written above; clear the upper halves. */
	vzeroupper
.Lblake2s_compress_avx512_end:
	/* The zero-block path lands here without touching vector registers. */
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */