xref: /OK3568_Linux_fs/kernel/arch/arm/crypto/nh-neon-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * NH - ε-almost-universal hash function, NEON accelerated version
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright 2018 Google LLC
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * Author: Eric Biggers <ebiggers@google.com>
8*4882a593Smuzhiyun */
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun#include <linux/linkage.h>
11*4882a593Smuzhiyun
12*4882a593Smuzhiyun	.text
13*4882a593Smuzhiyun	.fpu		neon
14*4882a593Smuzhiyun
	// Core registers r0-r3, named after the four arguments of nh_neon()
	// in prototype order: key, message, message_len, hash.
15*4882a593Smuzhiyun	KEY		.req	r0
16*4882a593Smuzhiyun	MESSAGE		.req	r1
17*4882a593Smuzhiyun	MESSAGE_LEN	.req	r2
18*4882a593Smuzhiyun	HASH		.req	r3
19*4882a593Smuzhiyun
	// One 128-bit accumulator (q0-q3) per NH pass.  Each accumulator holds
	// two 64-bit partial sums; the _A/_B names alias its two 64-bit halves,
	// which nh_neon() adds together just before storing the result.
20*4882a593Smuzhiyun	PASS0_SUMS	.req	q0
21*4882a593Smuzhiyun	PASS0_SUM_A	.req	d0
22*4882a593Smuzhiyun	PASS0_SUM_B	.req	d1
23*4882a593Smuzhiyun	PASS1_SUMS	.req	q1
24*4882a593Smuzhiyun	PASS1_SUM_A	.req	d2
25*4882a593Smuzhiyun	PASS1_SUM_B	.req	d3
26*4882a593Smuzhiyun	PASS2_SUMS	.req	q2
27*4882a593Smuzhiyun	PASS2_SUM_A	.req	d4
28*4882a593Smuzhiyun	PASS2_SUM_B	.req	d5
29*4882a593Smuzhiyun	PASS3_SUMS	.req	q3
30*4882a593Smuzhiyun	PASS3_SUM_A	.req	d6
31*4882a593Smuzhiyun	PASS3_SUM_B	.req	d7
	// Four 16-byte key strides (q4-q7).  Their roles rotate from one
	// _nh_stride invocation to the next: three hold key data that is
	// already loaded and the fourth receives the next key stride.
32*4882a593Smuzhiyun	K0		.req	q4
33*4882a593Smuzhiyun	K1		.req	q5
34*4882a593Smuzhiyun	K2		.req	q6
35*4882a593Smuzhiyun	K3		.req	q7
	// Scratch registers (q8-q11) for the per-pass "message + key" sums.
	// T3 also receives the freshly loaded message stride.  The _L/_H names
	// alias the low and high 64-bit halves that are multiplied together.
	// T0 and T1 are reused at the end of nh_neon() to stage the result.
36*4882a593Smuzhiyun	T0		.req	q8
37*4882a593Smuzhiyun	T0_L		.req	d16
38*4882a593Smuzhiyun	T0_H		.req	d17
39*4882a593Smuzhiyun	T1		.req	q9
40*4882a593Smuzhiyun	T1_L		.req	d18
41*4882a593Smuzhiyun	T1_H		.req	d19
42*4882a593Smuzhiyun	T2		.req	q10
43*4882a593Smuzhiyun	T2_L		.req	d20
44*4882a593Smuzhiyun	T2_H		.req	d21
45*4882a593Smuzhiyun	T3		.req	q11
46*4882a593Smuzhiyun	T3_L		.req	d22
47*4882a593Smuzhiyun	T3_H		.req	d23
48*4882a593Smuzhiyun
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Process one 16-byte message stride for all four NH passes.
 *
 * k0, k1 and k2 name the q registers that already hold the three most
 * recently loaded key strides; k3 names the register that this macro fills
 * with the next 16 bytes of key.  Pass i (0..3) combines the message stride
 * with key register k<i>.
 *
 * For each pass, the four 32-bit words of (message + key) are formed with
 * wrapping 32-bit adds; word 0 is then multiplied by word 2 and word 1 by
 * word 3, and the two 64-bit products are added to the two 64-bit lanes of
 * that pass's accumulator.
 *
 * Side effects: MESSAGE and KEY are each advanced by 16 bytes; T0-T3 and the
 * register named by k3 are overwritten.
 */
49*4882a593Smuzhiyun.macro _nh_stride	k0, k1, k2, k3
50*4882a593Smuzhiyun
51*4882a593Smuzhiyun	// Load next message stride
52*4882a593Smuzhiyun	vld1.8		{T3}, [MESSAGE]!
53*4882a593Smuzhiyun
54*4882a593Smuzhiyun	// Load next key stride
55*4882a593Smuzhiyun	vld1.32		{\k3}, [KEY]!
56*4882a593Smuzhiyun
57*4882a593Smuzhiyun	// Add message words to key words
	// T3 holds the message and is a source operand of all four adds, so
	// the add that overwrites T3 has to come last.
58*4882a593Smuzhiyun	vadd.u32	T0, T3, \k0
59*4882a593Smuzhiyun	vadd.u32	T1, T3, \k1
60*4882a593Smuzhiyun	vadd.u32	T2, T3, \k2
61*4882a593Smuzhiyun	vadd.u32	T3, T3, \k3
62*4882a593Smuzhiyun
63*4882a593Smuzhiyun	// Multiply 32x32 => 64 and accumulate
	// Low half times high half, lane by lane: two 64-bit products per pass.
64*4882a593Smuzhiyun	vmlal.u32	PASS0_SUMS, T0_L, T0_H
65*4882a593Smuzhiyun	vmlal.u32	PASS1_SUMS, T1_L, T1_H
66*4882a593Smuzhiyun	vmlal.u32	PASS2_SUMS, T2_L, T2_H
67*4882a593Smuzhiyun	vmlal.u32	PASS3_SUMS, T3_L, T3_H
68*4882a593Smuzhiyun.endm
69*4882a593Smuzhiyun
70*4882a593Smuzhiyun/*
71*4882a593Smuzhiyun * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
72*4882a593Smuzhiyun *		u8 hash[NH_HASH_BYTES])
73*4882a593Smuzhiyun *
74*4882a593Smuzhiyun * It's guaranteed that message_len % 16 == 0.
 *
 * The message is consumed in 16-byte strides.  Every stride updates four
 * independent passes; pass i pairs message stride n with key stride n + i,
 * so three key strides are preloaded and one more is fetched per stride.
 *
 * In total the function reads 48 + message_len bytes starting at key and
 * message_len bytes starting at message, and writes 32 bytes to hash: the
 * 64-bit sums of pass 0, 1, 2 and 3, in that order.  When message_len is 0
 * no stride is executed and the stored sums are all zero.
 *
 * Clobbers r0-r2 (KEY, MESSAGE, MESSAGE_LEN), q0-q11 and the condition flags.
75*4882a593Smuzhiyun */
76*4882a593SmuzhiyunENTRY(nh_neon)
77*4882a593Smuzhiyun
	// Preload the first three key strides (48 bytes) and zero the four
	// accumulators.  NOTE(review): the extra-indented vmov lines are
	// presumably interleaved with the loads for scheduling -- confirm.
78*4882a593Smuzhiyun	vld1.32		{K0,K1}, [KEY]!
79*4882a593Smuzhiyun	  vmov.u64	PASS0_SUMS, #0
80*4882a593Smuzhiyun	  vmov.u64	PASS1_SUMS, #0
81*4882a593Smuzhiyun	vld1.32		{K2}, [KEY]!
82*4882a593Smuzhiyun	  vmov.u64	PASS2_SUMS, #0
83*4882a593Smuzhiyun	  vmov.u64	PASS3_SUMS, #0
84*4882a593Smuzhiyun
	// Bias the length by -64 so the flags set by each subs tell whether
	// another full 64-byte group is available.
85*4882a593Smuzhiyun	subs		MESSAGE_LEN, MESSAGE_LEN, #64
86*4882a593Smuzhiyun	blt		.Lloop4_done
87*4882a593Smuzhiyun.Lloop4:
	// Four strides = 64 bytes.  The key register arguments rotate by one
	// position per stride, so after the fourth stride K0-K2 again hold the
	// three key strides needed next and the loop body can simply repeat.
88*4882a593Smuzhiyun	_nh_stride	K0, K1, K2, K3
89*4882a593Smuzhiyun	_nh_stride	K1, K2, K3, K0
90*4882a593Smuzhiyun	_nh_stride	K2, K3, K0, K1
91*4882a593Smuzhiyun	_nh_stride	K3, K0, K1, K2
92*4882a593Smuzhiyun	subs		MESSAGE_LEN, MESSAGE_LEN, #64
93*4882a593Smuzhiyun	bge		.Lloop4
94*4882a593Smuzhiyun
95*4882a593Smuzhiyun.Lloop4_done:
	// MESSAGE_LEN is negative here (still biased by -64).  Masking with 63
	// recovers the leftover byte count, which is 0, 16, 32 or 48 given the
	// multiple-of-16 guarantee; up to three tail strides follow, using the
	// same key rotation as the main loop.
96*4882a593Smuzhiyun	ands		MESSAGE_LEN, MESSAGE_LEN, #63
97*4882a593Smuzhiyun	beq		.Ldone
98*4882a593Smuzhiyun	_nh_stride	K0, K1, K2, K3
99*4882a593Smuzhiyun
100*4882a593Smuzhiyun	subs		MESSAGE_LEN, MESSAGE_LEN, #16
101*4882a593Smuzhiyun	beq		.Ldone
102*4882a593Smuzhiyun	_nh_stride	K1, K2, K3, K0
103*4882a593Smuzhiyun
104*4882a593Smuzhiyun	subs		MESSAGE_LEN, MESSAGE_LEN, #16
105*4882a593Smuzhiyun	beq		.Ldone
	// 48 leftover bytes: third tail stride, then fall through to .Ldone.
106*4882a593Smuzhiyun	_nh_stride	K2, K3, K0, K1
107*4882a593Smuzhiyun
108*4882a593Smuzhiyun.Ldone:
109*4882a593Smuzhiyun	// Sum the accumulators for each pass, then store the sums to 'hash'
	// The four 64-bit results are packed into T0 and T1 (q8, q9) in pass
	// order, so the single store below writes 32 bytes.
110*4882a593Smuzhiyun	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
111*4882a593Smuzhiyun	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
112*4882a593Smuzhiyun	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
113*4882a593Smuzhiyun	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
114*4882a593Smuzhiyun	vst1.8		{T0-T1}, [HASH]
115*4882a593Smuzhiyun	bx		lr
116*4882a593SmuzhiyunENDPROC(nh_neon)
117