xref: /optee_os/core/arch/arm/crypto/aes_modes_armv8a_ce_a64.S (revision 2f41cd6f20781fe588ba7d166165414efa8aab63)
1/* SPDX-License-Identifier: BSD-2-Clause */
2/*
3 * Copyright (c) 2015, 2020 Linaro Limited
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * - AES cipher for ARMv8 with Crypto Extensions
7 * - Chaining mode wrappers for AES
8 */
9
10#include <asm.S>
11
12	.arch		armv8-a+crypto
13
14	/* Preload all round keys */
15	.macro		load_round_keys, rounds, rk
	/*
	 * Preload the whole key schedule at \rk into v17-v31.
	 * \rounds is 10/12/14 (AES-128/192/256 => 11/13/15 round keys).
	 * Shorter schedules start further into the register range so the
	 * last two round keys always land in v30/v31, which is what
	 * do_block_Nx/fin_round_Nx rely on.
	 */
16	cmp		\rounds, #12
17	blo		2222f		/* 128 bits */
18	beq		1111f		/* 192 bits */
	/* 256-bit key: 15 round keys, v17-v31 */
19	ld1		{v17.16b-v18.16b}, [\rk], #32
	/* 192-bit key: 13 round keys, v19-v31 */
201111:	ld1		{v19.16b-v20.16b}, [\rk], #32
	/* 128-bit key: 11 round keys, v21-v31 */
212222:	ld1		{v21.16b-v24.16b}, [\rk], #64
22	ld1		{v25.16b-v28.16b}, [\rk], #64
23	ld1		{v29.16b-v31.16b}, [\rk]
24	.endm
25
26	/* Prepare for encryption with key in rk[] */
27	.macro		enc_prepare, rounds, rk, ignore
	/* \ignore is an unused scratch-register argument kept so the call
	 * signature matches other (non-CE) AES implementations */
28	load_round_keys	\rounds, \rk
29	.endm
30
31	/* Prepare for encryption (again) but with new key in rk[] */
32	.macro		enc_switch_key, rounds, rk, ignore
	/* Reload v17-v31 with a different key schedule (e.g. XTS switching
	 * from the tweak key to the data key); \ignore is unused scratch */
33	load_round_keys	\rounds, \rk
34	.endm
35
36	/* Prepare for decryption with key in rk[] */
37	.macro		dec_prepare, rounds, rk, ignore
	/* Same preload as enc_prepare: the schedule at \rk must already be
	 * the inverse-cipher (decryption) form — cf. ce_aes_invert below,
	 * which converts one round key with aesimc */
38	load_round_keys	\rounds, \rk
39	.endm
40
41	.macro		do_enc_Nx, de, mc, k, i0, i1, i2, i3
	/*
	 * One full AES round on up to four interleaved blocks \i0-\i3 with
	 * round key \k.  \de selects aese/aesd, \mc selects aesmc/aesimc.
	 * .ifnb = "if argument not blank": callers pass 1, 2 or 4 blocks,
	 * so \i2 is only processed together with \i3 (never alone).
	 * Interleaving independent blocks hides the aese->aesmc latency.
	 */
42	aes\de		\i0\().16b, \k\().16b
43	aes\mc		\i0\().16b, \i0\().16b
44	.ifnb		\i1
45	aes\de		\i1\().16b, \k\().16b
46	aes\mc		\i1\().16b, \i1\().16b
47	.ifnb		\i3
48	aes\de		\i2\().16b, \k\().16b
49	aes\mc		\i2\().16b, \i2\().16b
50	aes\de		\i3\().16b, \k\().16b
51	aes\mc		\i3\().16b, \i3\().16b
52	.endif
53	.endif
54	.endm
55
56	/* Up to 4 interleaved encryption rounds with the same round key */
57	.macro		round_Nx, enc, k, i0, i1, i2, i3
	/* \enc is literally 'e' or 'd' (.ifc = compare strings): pick the
	 * encrypt (aese/aesmc) or decrypt (aesd/aesimc) round primitives */
58	.ifc		\enc, e
59	do_enc_Nx	e, mc, \k, \i0, \i1, \i2, \i3
60	.else
61	do_enc_Nx	d, imc, \k, \i0, \i1, \i2, \i3
62	.endif
63	.endm
64
65	/* Up to 4 interleaved final rounds */
66	.macro		fin_round_Nx, de, k, k2, i0, i1, i2, i3
	/*
	 * Final AES round on up to four blocks: aese/aesd with \k but no
	 * MixColumns step, then XOR in the last round key \k2 (aese/aesd
	 * perform AddRoundKey *before* SubBytes/ShiftRows, so the final
	 * AddRoundKey must be done explicitly with eor).
	 */
67	aes\de		\i0\().16b, \k\().16b
68	.ifnb		\i1
69	aes\de		\i1\().16b, \k\().16b
70	.ifnb		\i3
71	aes\de		\i2\().16b, \k\().16b
72	aes\de		\i3\().16b, \k\().16b
73	.endif
74	.endif
75	eor		\i0\().16b, \i0\().16b, \k2\().16b
76	.ifnb		\i1
77	eor		\i1\().16b, \i1\().16b, \k2\().16b
78	.ifnb		\i3
79	eor		\i2\().16b, \i2\().16b, \k2\().16b
80	eor		\i3\().16b, \i3\().16b, \k2\().16b
81	.endif
82	.endif
83	.endm
84
85	/* Up to 4 interleaved blocks */
86	.macro		do_block_Nx, enc, rounds, i0, i1, i2, i3
	/*
	 * Full AES encryption/decryption of up to 4 blocks using the round
	 * keys preloaded in v17-v31.  Branch past the leading keys for the
	 * shorter schedules: AES-256 uses v17-v29 + final (v30,v31),
	 * AES-192 starts at v19, AES-128 at v21.
	 */
87	cmp		\rounds, #12
88	blo		2222f		/* 128 bits */
89	beq		1111f		/* 192 bits */
90	round_Nx	\enc, v17, \i0, \i1, \i2, \i3
91	round_Nx	\enc, v18, \i0, \i1, \i2, \i3
921111:	round_Nx	\enc, v19, \i0, \i1, \i2, \i3
93	round_Nx	\enc, v20, \i0, \i1, \i2, \i3
	/* common tail: nine full rounds then the final round pair */
942222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
95	round_Nx	\enc, \key, \i0, \i1, \i2, \i3
96	.endr
97	fin_round_Nx	\enc, v30, v31, \i0, \i1, \i2, \i3
98	.endm
99
	/*
	 * Thin wrappers providing the generic encrypt/decrypt-block macro
	 * interface.  The trailing \t0-\t2 scratch arguments are unused
	 * here (keys are preloaded in registers) but kept so callers can
	 * be shared with other AES implementations.
	 */
100	.macro		encrypt_block, in, rounds, t0, t1, t2
101	do_block_Nx	e, \rounds, \in
102	.endm
103
104	.macro		encrypt_block2x, i0, i1, rounds, t0, t1, t2
105	do_block_Nx	e, \rounds, \i0, \i1
106	.endm
107
108	.macro		encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
109	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3
110	.endm
111
112	.macro		decrypt_block, in, rounds, t0, t1, t2
113	do_block_Nx	d, \rounds, \in
114	.endm
115
116	.macro		decrypt_block2x, i0, i1, rounds, t0, t1, t2
117	do_block_Nx	d, \rounds, \i0, \i1
118	.endm
119
120	.macro		decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
121	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3
122	.endm
123
124
125/*
126 * There are several ways to instantiate this code:
127 * - no interleave, all inline
128 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
129 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
130 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
131 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
132 *
133 * Macros imported by this code:
134 * - enc_prepare	- setup NEON registers for encryption
135 * - dec_prepare	- setup NEON registers for decryption
136 * - enc_switch_key	- change to new key after having prepared for encryption
137 * - encrypt_block	- encrypt a single block
138 * - decrypt_block	- decrypt a single block
139 * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
140 * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
141 * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
142 * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
143 */
144
145#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
	/*
	 * Out-of-line interleave: the N-way block routines are emitted once
	 * as local functions and reached via bl, so callers need a frame to
	 * preserve x30 (and keep sp 16-byte aligned).  Note ';' is a
	 * statement separator (not a comment) in AArch64 GNU as.
	 */
146#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
147#define FRAME_POP	ldp x29, x30, [sp],#16
148
149#if INTERLEAVE == 2
150
	/* 2-way helpers: blocks in v0/v1, rounds in w3 (x2/x6/w7 unused) */
151LOCAL_FUNC aes_encrypt_block2x , :
152	encrypt_block2x	v0, v1, w3, x2, x6, w7
153	ret
154END_FUNC aes_encrypt_block2x
155
156LOCAL_FUNC aes_decrypt_block2x , :
157	decrypt_block2x	v0, v1, w3, x2, x6, w7
158	ret
159END_FUNC aes_decrypt_block2x
160
161#elif INTERLEAVE == 4
162
	/* 4-way helpers: blocks in v0-v3, rounds in w3 (x2/x6/w7 unused) */
163LOCAL_FUNC aes_encrypt_block4x , :
164	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
165	ret
166END_FUNC aes_encrypt_block4x
167
168LOCAL_FUNC aes_decrypt_block4x , :
169	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
170	ret
171END_FUNC aes_decrypt_block4x
172
173#else
174#error INTERLEAVE should equal 2 or 4
175#endif
176
	/* do_*_blockNx dispatch to the out-of-line helpers above */
177	.macro		do_encrypt_block2x
178	bl		aes_encrypt_block2x
179	.endm
180
181	.macro		do_decrypt_block2x
182	bl		aes_decrypt_block2x
183	.endm
184
185	.macro		do_encrypt_block4x
186	bl		aes_encrypt_block4x
187	.endm
188
189	.macro		do_decrypt_block4x
190	bl		aes_decrypt_block4x
191	.endm
192
193#else
	/* Inline instantiation: no calls made, so no frame is required */
194#define FRAME_PUSH
195#define FRAME_POP
196
197	.macro		do_encrypt_block2x
198	encrypt_block2x	v0, v1, w3, x2, x6, w7
199	.endm
200
201	.macro		do_decrypt_block2x
202	decrypt_block2x	v0, v1, w3, x2, x6, w7
203	.endm
204
205	.macro		do_encrypt_block4x
206	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
207	.endm
208
209	.macro		do_decrypt_block4x
210	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
211	.endm
212
213#endif
214
215	/*
216	 * uint32_t ce_aes_sub(uint32_t in) - use the aese instruction to
217	 * perform the AES sbox substitution on each byte in 'input'
218	 */
219FUNC ce_aes_sub , :
	/*
	 * Broadcast w0 into all four 32-bit lanes; with every column equal,
	 * ShiftRows permutes identical bytes and is effectively a no-op, so
	 * aese against an all-zero "round key" (AddRoundKey with 0) leaves
	 * the plain per-byte SubBytes of w0 in every lane.
	 */
220	dup		v1.4s, w0
221	movi		v0.16b, #0
222	aese		v0.16b, v1.16b
223	umov		w0, v0.s[0]
224	ret
225END_FUNC ce_aes_sub
226
227	/*
228	 * void ce_aes_invert(void *dst, const void *src);
229	 */
230FUNC ce_aes_invert , :
	/*
	 * Apply InverseMixColumns (aesimc) to one 16-byte round key: this
	 * converts an encryption round key into the form needed by aesd
	 * for the equivalent-inverse-cipher decryption key schedule.
	 * x0: dst, x1: src (may alias).
	 */
231	ld1		{v0.16b}, [x1]
232	aesimc		v1.16b, v0.16b
233	st1		{v1.16b}, [x0]
234	ret
235END_FUNC ce_aes_invert
236
237	/*
238	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
239	 *                    int rounds, int blocks, int first)
240	 */
241FUNC ce_aes_ecb_encrypt , :
	/*
	 * x0: out, x1: in, x2: round keys, w3: rounds, w4: block count,
	 * w5: "first call" flag — non-zero loads the key schedule into
	 * v17-v31; zero reuses the keys left there by a previous call.
	 */
242	FRAME_PUSH
243	cbz		w5, .LecbencloopNx
244
245	enc_prepare	w3, x2, x5
246
247.LecbencloopNx:
248#if INTERLEAVE >= 2
249	subs		w4, w4, #INTERLEAVE
250	bmi		.Lecbenc1x
251#if INTERLEAVE == 2
252	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
253	do_encrypt_block2x
254	st1		{v0.16b-v1.16b}, [x0], #32
255#else
256	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
257	do_encrypt_block4x
258	st1		{v0.16b-v3.16b}, [x0], #64
259#endif
260	b		.LecbencloopNx
261.Lecbenc1x:
	/* undo the over-subtraction; remaining 1..N-1 blocks go one-by-one */
262	adds		w4, w4, #INTERLEAVE
263	beq		.Lecbencout
264#endif
265.Lecbencloop:
266	ld1		{v0.16b}, [x1], #16		/* get next pt block */
267	encrypt_block	v0, w3, x2, x5, w6
268	st1		{v0.16b}, [x0], #16
269	subs		w4, w4, #1
270	bne		.Lecbencloop
271.Lecbencout:
272	FRAME_POP
273	ret
274END_FUNC ce_aes_ecb_encrypt
275
276	/*
277	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
278	 *                    int rounds, int blocks, int first)
279	 */
280FUNC ce_aes_ecb_decrypt , :
	/*
	 * Mirror image of ce_aes_ecb_encrypt.  x0: out, x1: in,
	 * x2: (inverse) round keys, w3: rounds, w4: block count,
	 * w5: "first call" flag — non-zero (re)loads the key schedule.
	 */
281	FRAME_PUSH
282	cbz		w5, .LecbdecloopNx
283
284	dec_prepare	w3, x2, x5
285
286.LecbdecloopNx:
287#if INTERLEAVE >= 2
288	subs		w4, w4, #INTERLEAVE
289	bmi		.Lecbdec1x
290#if INTERLEAVE == 2
291	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
292	do_decrypt_block2x
293	st1		{v0.16b-v1.16b}, [x0], #32
294#else
295	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
296	do_decrypt_block4x
297	st1		{v0.16b-v3.16b}, [x0], #64
298#endif
299	b		.LecbdecloopNx
300.Lecbdec1x:
	/* undo the over-subtraction; remaining blocks go one-by-one */
301	adds		w4, w4, #INTERLEAVE
302	beq		.Lecbdecout
303#endif
304.Lecbdecloop:
305	ld1		{v0.16b}, [x1], #16		/* get next ct block */
306	decrypt_block	v0, w3, x2, x5, w6
307	st1		{v0.16b}, [x0], #16
308	subs		w4, w4, #1
309	bne		.Lecbdecloop
310.Lecbdecout:
311	FRAME_POP
312	ret
313END_FUNC ce_aes_ecb_decrypt
314
315	/*
316	 * void ce_aes_cbc_encrypt(uint8_t out[], uint8_t const in[],
317	 *			   uint8_t const rk[], int rounds, int blocks,
318	 *			   uint8_t iv[])
319	 */
320FUNC ce_aes_cbc_encrypt , :
	/*
	 * x0: out, x1: in, x2: round keys, w3: rounds, w4: block count,
	 * x5: iv (updated in place on return).  CBC encryption is serially
	 * dependent (each block's input includes the previous ciphertext),
	 * so blocks cannot be interleaved; the 4x loop only batches the
	 * loads/stores.  v4 carries the running iv / last ciphertext.
	 */
321	ld1		{v4.16b}, [x5]			/* get iv */
322	enc_prepare	w3, x2, x6
323
324.Lcbcencloop4x:
325	subs		w4, w4, #4
326	bmi		.Lcbcenc1x
327	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
328	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
329	encrypt_block	v0, w3, x2, x6, w7
330	eor		v1.16b, v1.16b, v0.16b
331	encrypt_block	v1, w3, x2, x6, w7
332	eor		v2.16b, v2.16b, v1.16b
333	encrypt_block	v2, w3, x2, x6, w7
334	eor		v3.16b, v3.16b, v2.16b
335	encrypt_block	v3, w3, x2, x6, w7
336	st1		{v0.16b-v3.16b}, [x0], #64
337	mov		v4.16b, v3.16b
338	b		.Lcbcencloop4x
339.Lcbcenc1x:
	/* undo the over-subtraction; remaining blocks go one-by-one */
340	adds		w4, w4, #4
341	beq		.Lcbcencout
342.Lcbcencloop:
343	ld1		{v0.16b}, [x1], #16		/* get next pt block */
344	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
345	encrypt_block	v4, w3, x2, x6, w7
346	st1		{v4.16b}, [x0], #16
347	subs		w4, w4, #1
348	bne		.Lcbcencloop
349.Lcbcencout:
350	st1		{v4.16b}, [x5]			/* return iv */
351	ret
352END_FUNC ce_aes_cbc_encrypt
353
354	/*
355	 * void ce_aes_cbc_decrypt(uint8_t out[], uint8_t const in[],
356	 *			   uint8_t const rk[], int rounds, int blocks,
357	 *			   uint8_t iv[])
358	 */
359FUNC ce_aes_cbc_decrypt , :
	/*
	 * x0: out, x1: in, x2: (inverse) round keys, w3: rounds,
	 * w4: block count, x5: iv (updated in place on return).
	 * Unlike encryption, CBC decryption parallelizes: all 4 blocks are
	 * decrypted at once, then XORed with the preceding ciphertexts.
	 * v7 holds the running iv across iterations.
	 * NOTE(review): the direct bl below presumes the out-of-line
	 * INTERLEAVE == 4 build of this file — confirm build flags.
	 */
360	stp		x29, x30, [sp, #-16]!
361	mov		x29, sp
362
363	ld1		{v7.16b}, [x5]			/* get iv */
364	dec_prepare	w3, x2, x6
365
366.LcbcdecloopNx:
367	subs		w4, w4, #4
368	bmi		.Lcbcdec1x
369	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	/* save ct blocks 0-2: they become the XOR inputs for blocks 1-3
	 * after v0-v3 are decrypted in place */
370	mov		v4.16b, v0.16b
371	mov		v5.16b, v1.16b
372	mov		v6.16b, v2.16b
373	bl		aes_decrypt_block4x
	/* step back to refetch ct block 3 into v7 as the next iv */
374	sub		x1, x1, #16
375	eor		v0.16b, v0.16b, v7.16b
376	eor		v1.16b, v1.16b, v4.16b
377	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
378	eor		v2.16b, v2.16b, v5.16b
379	eor		v3.16b, v3.16b, v6.16b
380	st1		{v0.16b-v3.16b}, [x0], #64
381	b		.LcbcdecloopNx
382.Lcbcdec1x:
	/* undo the over-subtraction; remaining blocks go one-by-one */
383	adds		w4, w4, #4
384	beq		.Lcbcdecout
385.Lcbcdecloop:
386	ld1		{v1.16b}, [x1], #16		/* get next ct block */
387	mov		v0.16b, v1.16b			/* ...and copy to v0 */
388	decrypt_block	v0, w3, x2, x6, w7
389	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
390	mov		v7.16b, v1.16b			/* ct is next iv */
391	st1		{v0.16b}, [x0], #16
392	subs		w4, w4, #1
393	bne		.Lcbcdecloop
394.Lcbcdecout:
395	st1		{v7.16b}, [x5]			/* return iv */
396	ldp		x29, x30, [sp], #16
397	ret
398END_FUNC ce_aes_cbc_decrypt
399
400
401	/*
402	 * void ce_aes_ctr_encrypt(uint8_t out[], uint8_t const in[],
403	 *			   uint8_t const rk[], int rounds, int blocks,
404	 *			   uint8_t ctr[], int first)
405	 */
406FUNC ce_aes_ctr_encrypt , :
	/*
	 * x0: out, x1: in, x2: round keys, w3: rounds, w4: block count,
	 * x5: 16-byte big-endian counter block (updated on return).
	 * The low 64 bits of the counter are kept byte-swapped (native
	 * endian) in x6 so they can be incremented with ordinary adds.
	 * NOTE(review): the direct bl below presumes the out-of-line
	 * INTERLEAVE == 4 build of this file — confirm build flags.
	 */
407	stp             x29, x30, [sp, #-16]!
408	mov             x29, sp
409
410	enc_prepare     w3, x2, x6
411	ld1             {v4.16b}, [x5]
412
413	umov            x6, v4.d[1]             /* keep swabbed ctr in reg */
414	rev             x6, x6
	/* If ctr.lo32 + blocks carries out of 32 bits, take the scalar
	 * loop, which propagates the carry into the upper counter words */
415	cmn             w6, w4                  /* 32 bit overflow? */
416	bcs             .Lctrloop
417.LctrloopNx:
418	subs            w4, w4, #4
419	bmi             .Lctr1x
	/* Build 4 counter blocks: v0 keeps the current counter, v1-v3 get
	 * ctr+1..ctr+3 byte-swapped back into the last 32-bit word */
420	add		w7, w6, #1
421	mov             v0.16b, v4.16b
422	add		w8, w6, #2
423	mov             v1.16b, v4.16b
424	add		w9, w6, #3
425	mov             v2.16b, v4.16b
426	rev		w7, w7
427	mov             v3.16b, v4.16b
428	rev		w8, w8
429	mov		v1.s[3], w7
430	rev		w9, w9
431	mov		v2.s[3], w8
432	mov		v3.s[3], w9
433	ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
434	bl              aes_encrypt_block4x
435	eor             v0.16b, v5.16b, v0.16b
436	ld1             {v5.16b}, [x1], #16             /* get 1 input block  */
437	eor             v1.16b, v6.16b, v1.16b
438	eor             v2.16b, v7.16b, v2.16b
439	eor             v3.16b, v5.16b, v3.16b
440	st1             {v0.16b-v3.16b}, [x0], #64
	/* advance the register copy and write it back into v4.d[1] */
441	add             x6, x6, #4
442	rev             x7, x6
443	ins             v4.d[1], x7
444	cbz             w4, .Lctrout
445	b               .LctrloopNx
446.Lctr1x:
	/* undo the over-subtraction; remaining blocks go one-by-one */
447	adds            w4, w4, #4
448	beq             .Lctrout
449.Lctrloop:
450	mov             v0.16b, v4.16b
451	encrypt_block   v0, w3, x2, x8, w7
452
453	adds            x6, x6, #1              /* increment BE ctr */
454	rev             x7, x6
455	ins             v4.d[1], x7
456	bcs             .Lctrcarry              /* overflow? */
457
458.Lctrcarrydone:
459	subs            w4, w4, #1
460	bmi             .Lctrtailblock          /* blocks <0 means tail block */
461	ld1             {v3.16b}, [x1], #16
462	eor             v3.16b, v0.16b, v3.16b
463	st1             {v3.16b}, [x0], #16
464	bne             .Lctrloop
465
466.Lctrout:
467	st1             {v4.16b}, [x5]          /* return next CTR value */
468	ldp             x29, x30, [sp], #16
469	ret
470
	/* partial final block: hand back the raw keystream block so the
	 * caller can XOR just the tail bytes */
471.Lctrtailblock:
472	st1             {v0.16b}, [x0]
473	ldp             x29, x30, [sp], #16
474	ret
475
	/* carry out of the low counter qword: also increment the
	 * (big-endian) upper half of the counter block */
476.Lctrcarry:
477	umov            x7, v4.d[0]             /* load upper word of ctr  */
478	rev             x7, x7                  /* ... to handle the carry */
479	add             x7, x7, #1
480	rev             x7, x7
481	ins             v4.d[0], x7
482	b               .Lctrcarrydone
483END_FUNC ce_aes_ctr_encrypt
484
485
486	.macro		next_tweak, out, in, const, tmp
	/*
	 * XTS tweak update: multiply \in by x in GF(2^128).
	 * sshr #63 replicates each 64-bit lane's MSB into an all-ones/zero
	 * mask; and-ing with \const ({1, 0x87}, see .Lxts_mul_x) selects the
	 * carry terms; add doubles each lane (shift left by one); ext #8
	 * swaps the two lanes so the low lane receives the 0x87 reduction
	 * and the high lane the cross-lane carry bit; eor merges them.
	 */
487	sshr		\tmp\().2d,  \in\().2d,   #63
488	and		\tmp\().16b, \tmp\().16b, \const\().16b
489	add		\out\().2d,  \in\().2d,   \in\().2d
490	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
491	eor		\out\().16b, \out\().16b, \tmp\().16b
492	.endm
493
494	/*
495	 * void ce_aes_xts_encrypt(uint8_t out[], uint8_t const in[],
496	 *			   uint8_t const rk1[], int rounds, int blocks,
497	 *			   uint8_t const rk2[], uint8_t iv[])
498	 */
499FUNC ce_aes_xts_encrypt , :
	/*
	 * x0: out, x1: in, x2: rk1 (data key), w3: rounds, w4: block count,
	 * x5: rk2 (tweak key), x6: iv / tweak buffer (updated on return).
	 * The first tweak is AES-encrypt(iv) under rk2; the schedule is
	 * then switched to rk1 for the data blocks.  v4 holds the current
	 * tweak; v7 holds the GF doubling constant (reloaded whenever the
	 * 4x path temporarily reuses v7 as the 4th tweak).
	 */
500	FRAME_PUSH
501
502	ld1		{v4.16b}, [x6]
503	enc_prepare	w3, x5, x6
504	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
505	enc_switch_key	w3, x2, x6
506	ldr		q7, .Lxts_mul_x
507	b		.LxtsencNx
508
509.LxtsencloopNx:
510	next_tweak	v4, v4, v7, v8
511.LxtsencNx:
512#if INTERLEAVE >= 2
513	subs		w4, w4, #INTERLEAVE
514	bmi		.Lxtsenc1x
515#if INTERLEAVE == 2
516	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
517	next_tweak	v5, v4, v7, v8
	/* pre-whiten with tweaks, encrypt, post-whiten with same tweaks */
518	eor		v0.16b, v0.16b, v4.16b
519	eor		v1.16b, v1.16b, v5.16b
520	do_encrypt_block2x
521	eor		v0.16b, v0.16b, v4.16b
522	eor		v1.16b, v1.16b, v5.16b
523	st1		{v0.16b-v1.16b}, [x0], #32
524	cbz		w4, .LxtsencoutNx
525	next_tweak	v4, v5, v7, v8
526	b		.LxtsencNx
527.LxtsencoutNx:
528	mov		v4.16b, v5.16b
529	b		.Lxtsencout
530#else
531	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
532	next_tweak	v5, v4, v7, v8
533	eor		v0.16b, v0.16b, v4.16b
534	next_tweak	v6, v5, v7, v8
535	eor		v1.16b, v1.16b, v5.16b
536	eor		v2.16b, v2.16b, v6.16b
	/* v7 (mul_x constant) is reused as the 4th tweak here ... */
537	next_tweak	v7, v6, v7, v8
538	eor		v3.16b, v3.16b, v7.16b
539	do_encrypt_block4x
540	eor		v3.16b, v3.16b, v7.16b
541	eor		v0.16b, v0.16b, v4.16b
542	eor		v1.16b, v1.16b, v5.16b
543	eor		v2.16b, v2.16b, v6.16b
544	st1		{v0.16b-v3.16b}, [x0], #64
545	mov		v4.16b, v7.16b
	/* ... so the constant must be reloaded for the next iteration */
546	ldr		q7, .Lxts_mul_x
547	cbz		w4, .Lxtsencout
548	b		.LxtsencloopNx
549#endif
550.Lxtsenc1x:
	/* undo the over-subtraction; remaining blocks go one-by-one */
551	adds		w4, w4, #INTERLEAVE
552	beq		.Lxtsencout
553#endif
554.Lxtsencloop:
555	ld1		{v1.16b}, [x1], #16
556	eor		v0.16b, v1.16b, v4.16b
557	encrypt_block	v0, w3, x2, x6, w7
558	eor		v0.16b, v0.16b, v4.16b
559	st1		{v0.16b}, [x0], #16
560	subs		w4, w4, #1
561	beq		.Lxtsencout
562	next_tweak	v4, v4, v7, v8
563	b		.Lxtsencloop
564.Lxtsencout:
	/* advance and store the tweak for a possible follow-up call */
565	next_tweak	v4, v4, v7, v8
566	st1		{v4.16b}, [x6], #16
567	FRAME_POP
568	ret
569
	/* GF(2^128) mul-by-x constant {1, 0x87}: bit-crossing carry in the
	 * low lane, reduction polynomial feedback term in the high lane */
570	.align		4
571.Lxts_mul_x:
572	.word		1, 0, 0x87, 0
573END_FUNC ce_aes_xts_encrypt
574
575	/*
576	 * void ce_aes_xts_decrypt(uint8_t out[], uint8_t const in[],
577	 *			   uint8_t const rk1[], int rounds, int blocks,
578	 *			   uint8_t const rk2[], uint8_t iv[])
579	 */
580FUNC ce_aes_xts_decrypt , :
	/*
	 * Mirror image of ce_aes_xts_encrypt.  x0: out, x1: in,
	 * x2: rk1 (inverse data key), w3: rounds, w4: block count,
	 * x5: rk2 (tweak key), x6: iv / tweak buffer (updated on return).
	 * The first tweak is still *encrypted* under rk2; only the data
	 * blocks use the decryption schedule (dec_prepare on rk1).
	 */
581	FRAME_PUSH
582
583	ld1		{v4.16b}, [x6]
584	enc_prepare	w3, x5, x6
585	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
586	dec_prepare	w3, x2, x6
587	ldr		q7, .Lxts_mul_x
588	b		.LxtsdecNx
589
590.LxtsdecloopNx:
591	next_tweak	v4, v4, v7, v8
592.LxtsdecNx:
593#if INTERLEAVE >= 2
594	subs		w4, w4, #INTERLEAVE
595	bmi		.Lxtsdec1x
596#if INTERLEAVE == 2
597	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
598	next_tweak	v5, v4, v7, v8
	/* pre-whiten with tweaks, decrypt, post-whiten with same tweaks */
599	eor		v0.16b, v0.16b, v4.16b
600	eor		v1.16b, v1.16b, v5.16b
601	do_decrypt_block2x
602	eor		v0.16b, v0.16b, v4.16b
603	eor		v1.16b, v1.16b, v5.16b
604	st1		{v0.16b-v1.16b}, [x0], #32
605	cbz		w4, .LxtsdecoutNx
606	next_tweak	v4, v5, v7, v8
607	b		.LxtsdecNx
608.LxtsdecoutNx:
609	mov		v4.16b, v5.16b
610	b		.Lxtsdecout
611#else
612	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
613	next_tweak	v5, v4, v7, v8
614	eor		v0.16b, v0.16b, v4.16b
615	next_tweak	v6, v5, v7, v8
616	eor		v1.16b, v1.16b, v5.16b
617	eor		v2.16b, v2.16b, v6.16b
	/* v7 (mul_x constant) is reused as the 4th tweak here ... */
618	next_tweak	v7, v6, v7, v8
619	eor		v3.16b, v3.16b, v7.16b
620	do_decrypt_block4x
621	eor		v3.16b, v3.16b, v7.16b
622	eor		v0.16b, v0.16b, v4.16b
623	eor		v1.16b, v1.16b, v5.16b
624	eor		v2.16b, v2.16b, v6.16b
625	st1		{v0.16b-v3.16b}, [x0], #64
626	mov		v4.16b, v7.16b
	/* ... so the constant must be reloaded for the next iteration */
627	ldr		q7, .Lxts_mul_x
628	cbz		w4, .Lxtsdecout
629	b		.LxtsdecloopNx
630#endif
631.Lxtsdec1x:
	/* undo the over-subtraction; remaining blocks go one-by-one */
632	adds		w4, w4, #INTERLEAVE
633	beq		.Lxtsdecout
634#endif
635.Lxtsdecloop:
636	ld1		{v1.16b}, [x1], #16
637	eor		v0.16b, v1.16b, v4.16b
638	decrypt_block	v0, w3, x2, x6, w7
639	eor		v0.16b, v0.16b, v4.16b
640	st1		{v0.16b}, [x0], #16
641	subs		w4, w4, #1
642	beq		.Lxtsdecout
643	next_tweak	v4, v4, v7, v8
644	b		.Lxtsdecloop
645.Lxtsdecout:
	/* FRAME_POP only touches the stack, so it is safe before the final
	 * tweak advance/store for a possible follow-up call */
646	FRAME_POP
647	next_tweak	v4, v4, v7, v8
648	st1		{v4.16b}, [x6], #16
649	ret
650END_FUNC ce_aes_xts_decrypt
651
652	/*
653	 * void ce_aes_xor_block(uint8_t out[], uint8_t const op1[],
654	 *			 uint8_t const op2[]);
655	 */
656FUNC ce_aes_xor_block , :
	/* out = op1 ^ op2 for one 16-byte block; x0: out, x1: op1, x2: op2
	 * (any of the pointers may alias) */
657	ld1	{v0.16b}, [x1]
658	ld1	{v1.16b}, [x2]
659	eor	v0.16b, v0.16b, v1.16b
660	st1	{v0.16b}, [x0]
661	ret
662END_FUNC ce_aes_xor_block
663
664BTI(emit_aarch64_feature_1_and     GNU_PROPERTY_AARCH64_FEATURE_1_BTI)
665