xref: /OK3568_Linux_fs/kernel/arch/arm64/crypto/nh-neon-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun#include <linux/linkage.h>
11*4882a593Smuzhiyun
12*4882a593Smuzhiyun	KEY		.req	x0
13*4882a593Smuzhiyun	MESSAGE		.req	x1
14*4882a593Smuzhiyun	MESSAGE_LEN	.req	x2
15*4882a593Smuzhiyun	HASH		.req	x3
16*4882a593Smuzhiyun
17*4882a593Smuzhiyun	PASS0_SUMS	.req	v0
18*4882a593Smuzhiyun	PASS1_SUMS	.req	v1
19*4882a593Smuzhiyun	PASS2_SUMS	.req	v2
20*4882a593Smuzhiyun	PASS3_SUMS	.req	v3
21*4882a593Smuzhiyun	K0		.req	v4
22*4882a593Smuzhiyun	K1		.req	v5
23*4882a593Smuzhiyun	K2		.req	v6
24*4882a593Smuzhiyun	K3		.req	v7
25*4882a593Smuzhiyun	T0		.req	v8
26*4882a593Smuzhiyun	T1		.req	v9
27*4882a593Smuzhiyun	T2		.req	v10
28*4882a593Smuzhiyun	T3		.req	v11
29*4882a593Smuzhiyun	T4		.req	v12
30*4882a593Smuzhiyun	T5		.req	v13
31*4882a593Smuzhiyun	T6		.req	v14
32*4882a593Smuzhiyun	T7		.req	v15
33*4882a593Smuzhiyun
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Feed one 16-byte message stride into all four NH passes.
 *
 * \k0, \k1 and \k2 name the registers that already hold the key strides for
 * passes 0-2.  \k3 names the register that receives the next key stride,
 * loaded here from KEY, and is used for pass 3.
 *
 * Post-increments MESSAGE and KEY by 16 each, updates PASS0_SUMS-PASS3_SUMS
 * and overwrites T0-T7.
 */
.macro _nh_stride	k0, k1, k2, k3

	// T3 = next 16 message bytes
	ld1		{T3.16b}, [MESSAGE], #16

	// \k3 = next 16 key bytes
	ld1		{\k3\().4s}, [KEY], #16

	// Tn = message words + key words of pass n (32-bit lanes, wrapping).
	// T3 doubles as the message source, so it is overwritten last.
	add		T0.4s, T3.4s, \k0\().4s
	add		T1.4s, T3.4s, \k1\().4s
	add		T2.4s, T3.4s, \k2\().4s
	add		T3.4s, T3.4s, \k3\().4s

	// Bring words 2-3 of each sum vector down next to words 0-1, then do
	// two 32x32 => 64 multiply-accumulates per pass:
	//	sums.d[i] += T.s[i] * T.s[i + 2]	(i = 0, 1)
	mov		T4.d[0], T0.d[1]
	mov		T5.d[0], T1.d[1]
	mov		T6.d[0], T2.d[1]
	mov		T7.d[0], T3.d[1]
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm
58*4882a593Smuzhiyun
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * Compute the four NH pass sums of 'message' under 'key' and store them to
 * 'hash'.
 *
 * The caller guarantees that message_len is a multiple of 16.  A length of 0
 * is accepted and produces all-zero sums.
 *
 * 48 + message_len bytes are read from 'key'.  32 bytes are written to
 * 'hash': the 64-bit sums of passes 0, 1, 2 and 3 in that order, each stored
 * least-significant byte first.
 *
 * Modifies KEY, MESSAGE, MESSAGE_LEN, the condition flags and the vector
 * registers aliased as PASS*_SUMS, K0-K3 and T0-T7.
 */
SYM_FUNC_START(nh_neon)

	// Preload the first three key strides; the fourth one is fetched by
	// the first _nh_stride.  Start all accumulators at zero.
	ld1		{K0.4s,K1.4s,K2.4s}, [KEY], #48
	movi		PASS0_SUMS.2d, #0
	movi		PASS1_SUMS.2d, #0
	movi		PASS2_SUMS.2d, #0
	movi		PASS3_SUMS.2d, #0

	// Main loop: 64 message bytes (four strides) per iteration.  Each
	// stride loads its new key stride over the oldest one, so the K
	// register roles rotate by one position per stride and are back in
	// their starting arrangement after four.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.lt		.Ltail
.Lblock64:
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.ge		.Lblock64

.Ltail:
	// MESSAGE_LEN is now (bytes left - 64).  Masking with 63 recovers the
	// bytes left, which is 0, 16, 32 or 48: up to three more strides.
	and		MESSAGE_LEN, MESSAGE_LEN, #63
	cbz		MESSAGE_LEN, .Lstore
	_nh_stride	K0, K1, K2, K3

	sub		MESSAGE_LEN, MESSAGE_LEN, #16
	cbz		MESSAGE_LEN, .Lstore
	_nh_stride	K1, K2, K3, K0

	sub		MESSAGE_LEN, MESSAGE_LEN, #16
	cbz		MESSAGE_LEN, .Lstore
	_nh_stride	K2, K3, K0, K1

.Lstore:
	// Fold the two 64-bit lanes of every accumulator into one sum per
	// pass (T0 = pass 0 | pass 1, T1 = pass 2 | pass 3) and write all
	// four sums to 'hash'.
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)
104