xref: /OK3568_Linux_fs/kernel/arch/arm/crypto/ghash-ce-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
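	@ The unused \b1 - \b4 arguments give __pmull_p64 the same argument
	@ signature as __pmull_p8 below, so the ghash_update macro can invoke
	@ either variant via __pmull_\pn without knowing which one is in use.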

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
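	/*
	 * In outline (a reading of the code below, not a quote from the
	 * paper): the 128-bit product D = A*B is built up from vmull.p8
	 * partial products of byte-rotated copies of the operands
	 * (A1*B, A*B1, A2*B, A*B2, ...). Each pair of partial products is
	 * XORed, the bits that wrapped around during the rotation are masked
	 * off with k16/k32/k48, and the result is realigned with vext.8 and
	 * folded into D = A*B with veor.
	 */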
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
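	// GHASH works in GF(2^128) reduced by x^128 + x^7 + x^2 + x + 1.
	// The MASK register (0xe1 shifted left by 57, set up by the p64
	// entry point) is the usual precomputed form of that polynomial for
	// the bit order used here; multiplying by it with vmull.p64 folds
	// the high half of the 256-bit product back into the low 128 bits.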
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
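	// The multiply-by-MASK folding steps of the p64 variant are replaced
	// here by shift-and-XOR sequences (left shifts by 57, 62 and 63 plus
	// a matching right-shift sequence), which apply the low-degree terms
	// of the reduction polynomial without a 64x64 carry-less multiply.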
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

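	//
	// Core GHASH update loop, expanded once per multiplication variant.
	// Register use follows pmull_ghash_update() below: r0 = block count,
	// r1 = digest (dg[]), r2 = source data, and the optional head block
	// pointer is the fifth argument, taken from the stack at [sp]. The
	// hash key material is loaded into NEON registers by the two ENTRY
	// points before this macro runs.
	//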
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
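	// Four blocks per iteration: the four inputs are multiplied by the
	// precomputed powers of the hash key (HH4, HH3, HH, SHASH) and the
	// products are accumulated in XL/XM/XH, so only one reduction is
	// needed per group of four blocks.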
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

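	// Single-block path: used by the p8 variant for every block, and by
	// the p64 variant until the remaining block count is a multiple of 4.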
2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

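	// Karatsuba recombination: fold the middle term XM into the 256-bit
	// product held in XH:XL, then reduce it back to 128 bits.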
4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
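	// The key struct is expected to hold the hash key H followed by the
	// precomputed powers H^2, H^3 and H^4 (as prepared by the C glue
	// code); they are loaded into SHASH, HH, HH3 and HH4 for the 4-way
	// path above.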
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H
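	// SHASH2_p64, SHASH2_H, HH34_L and HH34_H hold the XOR of the two
	// halves of each key power, precomputed for the (a1 + a0)(b1 + b0)
	// Karatsuba terms in the main loop.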

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

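	@ Precompute the byte-rotated copies of both halves of the hash key;
	@ these are passed to __pmull_p8 as its b1..b4 arguments so the
	@ rotations of the key operand do not have to be redone per block.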
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

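	@ Masks used by __pmull_p8 to strip the bits that wrap around when
	@ the rotated partial products are realigned.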
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)
339*4882a593SmuzhiyunENDPROC(pmull_ghash_update_p8)
340