xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/nh-sse2-x86_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright 2018 Google LLC
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * Author: Eric Biggers <ebiggers@google.com>
8*4882a593Smuzhiyun */
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun#include <linux/linkage.h>
11*4882a593Smuzhiyun
/*
 * Register aliases shared by the _nh_stride macro and nh_sse2.
 *
 * PASSn_SUMS	accumulator for NH pass n; each one holds two 64-bit partial
 *		sums that are only added together at the end of nh_sse2.
 * K0..K3	16-byte strides of the key currently held in registers.
 * T0..T7	scratch registers.
 * KEY, MESSAGE, MESSAGE_LEN, HASH
 *		the four arguments of nh_sse2, in prototype order.  %rdi, %rsi,
 *		%rdx and %rcx are the first four integer argument registers of
 *		the System V AMD64 calling convention.
 */
12*4882a593Smuzhiyun#define		PASS0_SUMS	%xmm0
13*4882a593Smuzhiyun#define		PASS1_SUMS	%xmm1
14*4882a593Smuzhiyun#define		PASS2_SUMS	%xmm2
15*4882a593Smuzhiyun#define		PASS3_SUMS	%xmm3
16*4882a593Smuzhiyun#define		K0		%xmm4
17*4882a593Smuzhiyun#define		K1		%xmm5
18*4882a593Smuzhiyun#define		K2		%xmm6
19*4882a593Smuzhiyun#define		K3		%xmm7
20*4882a593Smuzhiyun#define		T0		%xmm8
21*4882a593Smuzhiyun#define		T1		%xmm9
22*4882a593Smuzhiyun#define		T2		%xmm10
23*4882a593Smuzhiyun#define		T3		%xmm11
24*4882a593Smuzhiyun#define		T4		%xmm12
25*4882a593Smuzhiyun#define		T5		%xmm13
26*4882a593Smuzhiyun#define		T6		%xmm14
27*4882a593Smuzhiyun#define		T7		%xmm15
28*4882a593Smuzhiyun#define		KEY		%rdi
29*4882a593Smuzhiyun#define		MESSAGE		%rsi
30*4882a593Smuzhiyun#define		MESSAGE_LEN	%rdx
31*4882a593Smuzhiyun#define		HASH		%rcx
32*4882a593Smuzhiyun
/*
 * _nh_stride k0, k1, k2, k3, offset
 *
 * Process the 16-byte message stride at offset(MESSAGE) for all four passes.
 *
 * k0, k1 and k2 are used as already-loaded key strides; the key stride at
 * offset(KEY) is loaded into k3.  Pass n uses key register kn: the message
 * stride is added to it as four 32-bit words, then sum 0 is multiplied by
 * sum 2 and sum 1 by sum 3, and the two 64-bit products are added to the two
 * halves of PASSn_SUMS.
 *
 * Overwrites k0 (left holding the pass 0 products), k3 and T1-T7.  k1 and k2
 * are only read.  MESSAGE and KEY are not advanced.
 */
33*4882a593Smuzhiyun.macro _nh_stride	k0, k1, k2, k3, offset
34*4882a593Smuzhiyun
35*4882a593Smuzhiyun	// Load next message stride
36*4882a593Smuzhiyun	movdqu		\offset(MESSAGE), T1
37*4882a593Smuzhiyun
38*4882a593Smuzhiyun	// Load next key stride
39*4882a593Smuzhiyun	movdqu		\offset(KEY), \k3
40*4882a593Smuzhiyun
41*4882a593Smuzhiyun	// Add message words to key words
	// paddd overwrites its destination, so passes 2 and 3 work on copies
	// of the message (T2, T3), pass 1 on the message register itself (T1)
	// and pass 0 adds the message into k0 in place.
42*4882a593Smuzhiyun	movdqa		T1, T2
43*4882a593Smuzhiyun	movdqa		T1, T3
44*4882a593Smuzhiyun	paddd		T1, \k0    // reuse k0 to avoid a move
45*4882a593Smuzhiyun	paddd		\k1, T1
46*4882a593Smuzhiyun	paddd		\k2, T2
47*4882a593Smuzhiyun	paddd		\k3, T3
48*4882a593Smuzhiyun
49*4882a593Smuzhiyun	// Multiply 32x32 => 64 and accumulate
	// pmuludq only multiplies dwords 0 and 2 of its operands.  Shuffle
	// 0x10 places sums 0 and 1 in those positions and shuffle 0x32 places
	// sums 2 and 3 there, so each product register ends up holding
	// (sum0 * sum2, sum1 * sum3) as two 64-bit values.
50*4882a593Smuzhiyun	pshufd		$0x10, \k0, T4
51*4882a593Smuzhiyun	pshufd		$0x32, \k0, \k0
52*4882a593Smuzhiyun	pshufd		$0x10, T1, T5
53*4882a593Smuzhiyun	pshufd		$0x32, T1, T1
54*4882a593Smuzhiyun	pshufd		$0x10, T2, T6
55*4882a593Smuzhiyun	pshufd		$0x32, T2, T2
56*4882a593Smuzhiyun	pshufd		$0x10, T3, T7
57*4882a593Smuzhiyun	pshufd		$0x32, T3, T3
58*4882a593Smuzhiyun	pmuludq		T4, \k0
59*4882a593Smuzhiyun	pmuludq		T5, T1
60*4882a593Smuzhiyun	pmuludq		T6, T2
61*4882a593Smuzhiyun	pmuludq		T7, T3
62*4882a593Smuzhiyun	paddq		\k0, PASS0_SUMS
63*4882a593Smuzhiyun	paddq		T1, PASS1_SUMS
64*4882a593Smuzhiyun	paddq		T2, PASS2_SUMS
65*4882a593Smuzhiyun	paddq		T3, PASS3_SUMS
66*4882a593Smuzhiyun.endm
67*4882a593Smuzhiyun
68*4882a593Smuzhiyun/*
69*4882a593Smuzhiyun * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
70*4882a593Smuzhiyun *		u8 hash[NH_HASH_BYTES])
71*4882a593Smuzhiyun *
72*4882a593Smuzhiyun * It's guaranteed that message_len % 16 == 0.
 *
 * Runs the four NH passes over 'message' in one sweep and writes the four
 * 64-bit pass totals to 'hash'.
 *
 * key:         read 16 bytes at a time with unaligned loads; 0x30 bytes are
 *              read up front and 16 more for every 16 bytes of message.
 * message:     read 16 bytes at a time with unaligned loads.
 * message_len: byte count; 0x40 bytes are handled per main-loop iteration and
 *              a 0x10, 0x20 or 0x30 byte remainder by the unrolled tail.  The
 *              tail's zero checks rely on the multiple-of-16 guarantee above.
 * hash:        receives 32 bytes: the totals of passes 0, 1, 2 and 3, in that
 *              order, written with unaligned stores.
 *
 * Modifies KEY, MESSAGE, MESSAGE_LEN, the flags and %xmm0-%xmm15.
 * NOTE(review): no vector register is saved or restored here; presumably the
 * C caller is responsible for making SSE use safe (e.g. kernel_fpu_begin())
 * - confirm against the glue code.
73*4882a593Smuzhiyun */
74*4882a593SmuzhiyunSYM_FUNC_START(nh_sse2)
75*4882a593Smuzhiyun
	// Preload the first three key strides and step KEY past them, so that
	// a given offset in _nh_stride addresses the key stride three ahead
	// of the message stride at the same offset.
76*4882a593Smuzhiyun	movdqu		0x00(KEY), K0
77*4882a593Smuzhiyun	movdqu		0x10(KEY), K1
78*4882a593Smuzhiyun	movdqu		0x20(KEY), K2
79*4882a593Smuzhiyun	add		$0x30, KEY
	// Clear the four pass accumulators
80*4882a593Smuzhiyun	pxor		PASS0_SUMS, PASS0_SUMS
81*4882a593Smuzhiyun	pxor		PASS1_SUMS, PASS1_SUMS
82*4882a593Smuzhiyun	pxor		PASS2_SUMS, PASS2_SUMS
83*4882a593Smuzhiyun	pxor		PASS3_SUMS, PASS3_SUMS
84*4882a593Smuzhiyun
	// From here on MESSAGE_LEN is kept biased by -0x40: as a signed value
	// it is non-negative exactly when at least 0x40 bytes remain.
85*4882a593Smuzhiyun	sub		$0x40, MESSAGE_LEN
86*4882a593Smuzhiyun	jl		.Lloop4_done
87*4882a593Smuzhiyun.Lloop4:
	// Four strides per iteration.  The key registers rotate by one each
	// stride (the register consumed as k0 is reloaded as k3 on the next
	// stride), so after four strides the roles are back where they began.
88*4882a593Smuzhiyun	_nh_stride	K0, K1, K2, K3, 0x00
89*4882a593Smuzhiyun	_nh_stride	K1, K2, K3, K0, 0x10
90*4882a593Smuzhiyun	_nh_stride	K2, K3, K0, K1, 0x20
91*4882a593Smuzhiyun	_nh_stride	K3, K0, K1, K2, 0x30
92*4882a593Smuzhiyun	add		$0x40, KEY
93*4882a593Smuzhiyun	add		$0x40, MESSAGE
94*4882a593Smuzhiyun	sub		$0x40, MESSAGE_LEN
95*4882a593Smuzhiyun	jge		.Lloop4
96*4882a593Smuzhiyun
97*4882a593Smuzhiyun.Lloop4_done:
	// The low six bits of the biased count equal the number of bytes
	// still unprocessed.  The tail runs one stride per remaining 0x10
	// bytes (at most three), continuing the key rotation used by the
	// loop; KEY and MESSAGE are not advanced, only the offsets change.
98*4882a593Smuzhiyun	and		$0x3f, MESSAGE_LEN
99*4882a593Smuzhiyun	jz		.Ldone
100*4882a593Smuzhiyun	_nh_stride	K0, K1, K2, K3, 0x00
101*4882a593Smuzhiyun
102*4882a593Smuzhiyun	sub		$0x10, MESSAGE_LEN
103*4882a593Smuzhiyun	jz		.Ldone
104*4882a593Smuzhiyun	_nh_stride	K1, K2, K3, K0, 0x10
105*4882a593Smuzhiyun
106*4882a593Smuzhiyun	sub		$0x10, MESSAGE_LEN
107*4882a593Smuzhiyun	jz		.Ldone
108*4882a593Smuzhiyun	_nh_stride	K2, K3, K0, K1, 0x20
109*4882a593Smuzhiyun
110*4882a593Smuzhiyun.Ldone:
111*4882a593Smuzhiyun	// Sum the accumulators for each pass, then store the sums to 'hash'
	// Each PASSn_SUMS holds two partial sums (A in the low half, B in the
	// high half).  The unpacks pair the A halves and the B halves of two
	// neighbouring passes so that one paddq finishes both passes at once.
112*4882a593Smuzhiyun	movdqa		PASS0_SUMS, T0
113*4882a593Smuzhiyun	movdqa		PASS2_SUMS, T1
114*4882a593Smuzhiyun	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
115*4882a593Smuzhiyun	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
116*4882a593Smuzhiyun	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
117*4882a593Smuzhiyun	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
118*4882a593Smuzhiyun	paddq		PASS0_SUMS, T0
119*4882a593Smuzhiyun	paddq		PASS2_SUMS, T1
	// hash[0..15] = pass 0 and 1 totals, hash[16..31] = pass 2 and 3 totals
120*4882a593Smuzhiyun	movdqu		T0, 0x00(HASH)
121*4882a593Smuzhiyun	movdqu		T1, 0x10(HASH)
122*4882a593Smuzhiyun	RET
123*4882a593SmuzhiyunSYM_FUNC_END(nh_sse2)
124