/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * Copyright (c) 2020, 2024 Linaro Limited
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 */

#include <arm64_macros.S>
#include <asm.S>
#define CPU_LE(x...)	x

/*
 * Only increment the lowest quarter of the counter, that is, its low
 * 32 bits. If that word wraps, the carry must not propagate into the
 * upper bits.
 */
#define INC_QUART_CTR	1

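/*
 * Note: with INC_QUART_CTR set this matches GCM's inc32() operation: the
 * counter block ends in a 32-bit big-endian block counter and only that
 * word is incremented, modulo 2^32. Rough C sketch, assuming hypothetical
 * big-endian helpers get_be32()/put_be32() and a byte view of the counter
 * block:
 *
 *	uint8_t *p = (uint8_t *)ctr + 12;
 *
 *	put_be32(p, get_be32(p) + 1);
 */
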
	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm
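
	//
	// __pmull_p64/__pmull2_p64 wrap the single-instruction 64x64 -> 128
	// bit carryless multiply: the former multiplies the low 64-bit lanes
	// of its operands, the latter (pmull2) the high lanes.
	//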

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm
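
	//
	// The p8 fallback above builds the 64x64 carryless multiply from the
	// 8-bit polynomial multiply only: byte-rotated copies of the operands
	// are multiplied with vector-wide pmull (eight 8x8 -> 16 products at
	// a time) and the partial products are then masked, shifted into
	// place with ext and XORed together.
	//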

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm
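
	//
	// Assumption from the loads above: the key struct carries H at
	// offset 0, followed by H^2, H^3 and H^4.  SHASH2 and HH34 cache the
	// XOR of the high and low halves of those powers, i.e. the (b1 + b0)
	// operand of the Karatsuba middle product used in the 4-way loop.
	// MASK (0xe1 shifted left by 57 in each doubleword) is the usual
	// constant for reducing modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1 in this bit-reflected representation.
	//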

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_imm		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm
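
	//
	// The hash key is fixed for the duration of a call, so the
	// byte-rotated copies of SHASH/SHASH2 (sh1-sh4, ss1-ss4) and the
	// byte masks k00_16/k32_48 are computed once here rather than in
	// every expansion of __pmull_p8_tail.
	//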

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm
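
	//
	// Note: both reduce macros leave the job partially done; the callers
	// below still XOR XH and T2 into XL, which completes the fold of the
	// 256-bit product held across XL/XM/XH down to 128 bits modulo the
	// GHASH polynomial.
	//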

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

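	//
	// GHASH update core shared by the p64 and p8 entry points below.
	// Per AAPCS64 their arguments arrive as:
	//	w0: number of 16-byte blocks to process
	//	x1: dg[2], the running GHASH state
	//	x2: source data
	//	x3: internal_ghash_key (H first, precomputed powers for p64)
	//	x4: optional "head" block to hash first, or NULL
	//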
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.16b}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.16b}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

/*
 * void pmull_ghash_update_p64(int blocks, uint64_t dg[2], const uint8_t *src,
 *			       const struct internal_ghash_key *ghash_key,
 *			       const uint8_t *head);
 */
FUNC pmull_ghash_update_p64 , :
	__pmull_ghash	p64
END_FUNC pmull_ghash_update_p64

/*
 * void pmull_ghash_update_p8(int blocks, uint64_t dg[2], const uint8_t *src,
 *			      const struct internal_ghash_key *ghash_key,
 *			      const uint8_t *head);
 */
FUNC pmull_ghash_update_p8 , :
	__pmull_ghash	p8
END_FUNC pmull_ghash_update_p8

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm
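
	//
	// Round key layout set up above: the expanded AES key lives in
	// v17-v31 so that AES-128 only touches v21-v31, AES-192 adds
	// v19-v20 and AES-256 adds v17-v18.  v30 feeds the final aese and
	// v31 is the last AddRoundKey value, as consumed by enc_block and
	// the interleaved rounds in pmull_gcm_do_crypt.
	//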

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

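	//
	// GCM bulk en/decryption.  Per AAPCS64 the arguments of
	// pmull_gcm_encrypt()/pmull_gcm_decrypt() below arrive as:
	//	w0: number of blocks (the loop assumes a multiple of 2)
	//	x1: dg[2], the running GHASH state
	//	x2: destination buffer
	//	x3: source buffer
	//	x4: ghash key (assumed H followed by H^2, per the two loads)
	//	x5: counter block
	//	x6: round keys, or NULL if already loaded into v17-v31
	//	w7: number of AES rounds
	//	[sp]: keystream buffer ks[] (encrypt path only)
	//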
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
#if INC_QUART_CTR
	ldr		x8, [x5, #8]			// load lower counter
#else
	ldp		x9, x8, [x5]			// load counter
#endif

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
#if !INC_QUART_CTR
CPU_LE(	rev		x9, x9		)
#endif
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

#if INC_QUART_CTR
	lsr		x12, x8, #32		// Save the upper 32 bits
	rev		x9, x8
	add		w11, w8, #1
	add		w8, w8, #2
	add		x11, x11, x12, lsl #32	// Restore the upper 32 bits
	add		x8, x8, x12, lsl #32
#endif

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	sub		w0, w0, #2

#if INC_QUART_CTR
	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11
#else
	ins		KS0.d[1], x8
	ins		KS0.d[0], x9
	rev64		KS0.16b, KS0.16b

	add		x8, x8, #1
	cbnz		x8, 10f
	add		x9, x9, #1
10:
	ins		KS1.d[1], x8
	ins		KS1.d[0], x9
	rev64		KS1.16b, KS1.16b

	add		x8, x8, #1
	cbnz		x8, 11f
	add		x9, x9, #1
11:
#endif
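
	// At this point KS0/KS1 hold the next two counter blocks.  The AES
	// rounds below turn them into keystream, interleaved with the GHASH
	// multiplication of the two ciphertext blocks so that the aese and
	// pmull pipelines overlap.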

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
#if !INC_QUART_CTR
CPU_LE(	rev		x9, x9		)
#endif
	st1		{XL.2d}, [x1]
#if INC_QUART_CTR
	str		x8, [x5, #8]			// store lower counter
#else
	stp		x9, x8, [x5]			// store counter
#endif

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

/*
 * void pmull_gcm_encrypt(int blocks, uint64_t dg[2], uint8_t dst[],
 *			  const uint8_t src[],
 *			  const struct internal_ghash_key *ghash_key,
 *			  uint64_t ctr[], const uint64_t rk[], int rounds,
 *			  uint8_t ks[]);
 */
FUNC pmull_gcm_encrypt , :
	pmull_gcm_do_crypt	1
END_FUNC pmull_gcm_encrypt

/*
 * void pmull_gcm_decrypt(int blocks, uint64_t dg[2], uint8_t dst[],
 *			  const uint8_t src[],
 *			  const struct internal_ghash_key *ghash_key,
 *			  uint64_t ctr[], const uint64_t rk[], int rounds);
 */
FUNC pmull_gcm_decrypt , :
	pmull_gcm_do_crypt	0
END_FUNC pmull_gcm_decrypt

/*
 * void pmull_gcm_encrypt_block(uint8_t dst[], const uint8_t src[], int rounds)
 */
FUNC pmull_gcm_encrypt_block , :
	ld1		{v0.16b}, [x1]
	enc_block	v0, w2
	st1		{v0.16b}, [x0]
	ret
END_FUNC pmull_gcm_encrypt_block

/*
 * void pmull_gcm_load_round_keys(const uint64_t rk[30], int rounds)
 */
FUNC pmull_gcm_load_round_keys , :
	load_round_keys	w1, x0
	ret
END_FUNC pmull_gcm_load_round_keys

/*
 * uint32_t pmull_gcm_aes_sub(uint32_t input)
 *
 * use the aese instruction to perform the AES sbox substitution
 * on each byte in 'input'
 */
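
/*
 * Note on why this works: dup replicates 'input' across all four words of
 * v1, and aese with an all-zero accumulator XORs that in before applying
 * SubBytes and ShiftRows.  ShiftRows is a no-op when every column of the
 * state is identical, so lane 0 of the result is just the S-box applied
 * to each byte of 'input'.
 */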
FUNC pmull_gcm_aes_sub , :
	dup	v1.4s, w0
	movi	v0.16b, #0
	aese	v0.16b, v1.16b
	umov	w0, v0.s[0]
	ret
END_FUNC pmull_gcm_aes_sub

BTI(emit_aarch64_feature_1_and     GNU_PROPERTY_AARCH64_FEATURE_1_BTI)