xref: /OK3568_Linux_fs/kernel/arch/arm64/crypto/aes-modes.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */
9*4882a593Smuzhiyun
/*
 * Section setup and interleave-stride selection.
 *
 * MAX_STRIDE is the number of AES blocks the ECB, CBC-decrypt and CTR bulk
 * loops below handle per iteration.  It defaults to 4 unless it was already
 * defined before this point.  ST4(...) emits its argument only when
 * MAX_STRIDE == 4; ST5(...) emits its argument only otherwise (the code
 * below pairs ST5 with the 5-block helpers built under MAX_STRIDE == 5).
 */
	.text
	.align		4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
24*4882a593Smuzhiyun
/*
 * aes_encrypt_block4x - out-of-line instance of the encrypt_block4x macro
 *
 * Works on v0-v3.  w3 and x2 are passed straight to the macro; the callers
 * in this file keep the round count in w3 and the round key pointer in x2.
 * x8 and w7 are also handed to the macro - presumably as scratch, its
 * definition lives in the including file.
 * Reached through "bl", so the caller must have preserved x30.
 */
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)
29*4882a593Smuzhiyun
/*
 * aes_decrypt_block4x - out-of-line instance of the decrypt_block4x macro
 *
 * Works on v0-v3.  w3 and x2 are passed straight to the macro; the callers
 * in this file keep the round count in w3 and the round key pointer in x2.
 * x8 and w7 are also handed to the macro - presumably as scratch, its
 * definition lives in the including file.
 * Reached through "bl", so the caller must have preserved x30.
 */
SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)
34*4882a593Smuzhiyun
/*
 * 5-way variants of the helpers above, only built when MAX_STRIDE == 5.
 * They work on v0-v4 and take the same w3/x2/x8/w7 operands as the 4-way
 * versions.  Reached through "bl", so the caller must have preserved x30.
 */
#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif
46*4882a593Smuzhiyun
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
53*4882a593Smuzhiyun
/*
 * aes_ecb_encrypt - encrypt whole AES blocks in ECB mode
 *
 * In:	x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
 *
 * x0 and x1 are post-incremented by 16 bytes per block.  x29/x30 are saved
 * because the bulk loop reaches the multi-block helper through "bl".
 */
AES_FUNC_START(aes_ecb_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc1x			/* < MAX_STRIDE blocks left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE		/* undo the subs above */
	beq		.Lecbencout			/* nothing left over */
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)
83*4882a593Smuzhiyun
84*4882a593Smuzhiyun
/*
 * aes_ecb_decrypt - decrypt whole AES blocks in ECB mode
 *
 * In:	x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
 *
 * x0 and x1 are post-incremented by 16 bytes per block.  x29/x30 are saved
 * because the bulk loop reaches the multi-block helper through "bl".
 */
AES_FUNC_START(aes_ecb_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x			/* < MAX_STRIDE blocks left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE		/* undo the subs above */
	beq		.Lecbdecout			/* nothing left over */
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)
114*4882a593Smuzhiyun
115*4882a593Smuzhiyun
	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
128*4882a593Smuzhiyun
/*
 * aes_essiv_cbc_encrypt / aes_cbc_encrypt - CBC encryption
 *
 * In:	x0 = out, x1 = in, x2 = rk (rk1), w3 = rounds, w4 = blocks,
 *	x5 = iv, x6 = rk2 (ESSIV entry point only)
 *
 * The ESSIV entry point first encrypts the IV with rk2 using a fixed 14
 * rounds, switches to the data key and then joins the plain CBC body.
 * v4 carries the chaining value and is written back to [x5] on exit.
 * Each block depends on the previous ciphertext, so the "4x" loop still
 * encrypts one block at a time; it only batches the loads and stores.
 * No "bl" is used, hence no frame record.
 */
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9		/* iv = E(rk2, iv) */
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs		w4, w4, #4
	bmi		.Lcbcenc1x			/* < 4 blocks left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b			/* last ct is next iv */
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4			/* undo the subs above */
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
172*4882a593Smuzhiyun
/*
 * aes_essiv_cbc_decrypt / aes_cbc_decrypt - CBC decryption
 *
 * In:	x0 = out, x1 = in, x2 = rk (rk1), w3 = rounds, w4 = blocks,
 *	x5 = iv, x6 = rk2 (ESSIV entry point only)
 *
 * The ESSIV entry point first encrypts the IV with rk2 using a fixed 14
 * rounds and then joins the plain CBC body.  "cbciv" is a register alias
 * defined by the including file; it holds the chaining value and is
 * written back to [x5] on exit.
 *
 * In the bulk loop the leading ciphertext blocks are copied aside before
 * decryption so they can be xor'ed into the following plaintext blocks;
 * the trailing ciphertext block(s) are re-read from memory by rewinding x1.
 */
AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9		/* iv = E(rk2, iv) */
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x			/* < MAX_STRIDE blocks left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b			/* keep ct0..ct2 */
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32			/* back to ct3 */
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b			/* keep ct0..ct2 */
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16			/* back to ct3 */
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE		/* undo the subs above */
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
243*4882a593Smuzhiyun
244*4882a593Smuzhiyun
	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
251*4882a593Smuzhiyun
/*
 * aes_cbc_cts_encrypt - CBC encryption of the final two blocks with
 * ciphertext stealing
 *
 * In:	x0 = out, x1 = in, x2 = rk, w3 = rounds, x4 = bytes, x5 = iv
 *
 * After the initial subtraction x4 holds (bytes - 16).  The input is read
 * as two 16-byte vectors at offsets 0 and x4, and the output is written
 * the same way, so the two accesses overlap unless x4 == 16.  The two
 * permute vectors taken from .Lcts_permute_table at offsets x4 and
 * (32 - x4) shift the partial block into place; TBL yields zero for the
 * 0xff (out of range) indices.
 *
 * NOTE(review): "bytes" is declared int but is consumed as the 64-bit x4
 * without extension - this relies on the caller leaving the upper half of
 * x4 clear; confirm against the callers.
 */
AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)
280*4882a593Smuzhiyun
/*
 * aes_cbc_cts_decrypt - CBC decryption of the final two blocks with
 * ciphertext stealing
 *
 * In:	x0 = out, x1 = in, x2 = rk, w3 = rounds, x4 = bytes, x5 = iv
 *
 * After the initial subtraction x4 holds (bytes - 16).  The input is read
 * as two 16-byte vectors at offsets 0 and x4, and the output is written
 * the same way, so the two accesses overlap unless x4 == 16.  The permute
 * vectors come from .Lcts_permute_table at offsets x4 and (32 - x4).
 * TBX leaves destination bytes untouched for the 0xff (out of range)
 * indices, which is how bytes of the first decryption result are kept
 * while the rest are replaced from v1.
 *
 * NOTE(review): "bytes" is declared int but is consumed as the 64-bit x4
 * without extension - this relies on the caller leaving the upper half of
 * x4 clear; confirm against the callers.
 */
AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)
309*4882a593Smuzhiyun
/*
 * .Lcts_permute_table - 48-byte index table for the ciphertext stealing
 * code: 16 x 0xff, the identity indices 0x0..0xf, then 16 x 0xff again.
 * A 16-byte load at offset n (0 <= n <= 32) gives a TBL/TBX index vector
 * that is a shifted window of the identity permutation padded with 0xff
 * (out of range) entries.  The table is aligned to 64 bytes.
 */
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
320*4882a593Smuzhiyun
321*4882a593Smuzhiyun
	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
326*4882a593Smuzhiyun
/*
 * aes_ctr_encrypt - CTR mode encryption
 *
 * In:	x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = ctr
 *
 * "vctr" is a register alias defined by the including file; it holds the
 * big-endian counter block.  x6 mirrors its low 64 bits in byte-swapped
 * (native) form so they can be incremented with integer arithmetic.
 *
 * The bulk loop patches only the low 32 bits of each counter block, so it
 * is bypassed in favour of the one-block loop (which propagates carries
 * properly) when adding w4 to the low 32 bits of the counter would carry.
 *
 * If the block count goes negative in the one-block loop, the keystream
 * block itself is stored to [x0] instead of being xor'ed with input (see
 * .Lctrtailblock).  The updated counter is written back to [x5] on exit.
 */
AES_FUNC_START(aes_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x6
	ld1		{vctr.16b}, [x5]

	umov		x6, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x6, x6
	cmn		w6, w4			/* 32 bit overflow? */
	bcs		.Lctrloop
.LctrloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lctr1x
	/* v0 = ctr, v1..v3 (v4) = ctr + 1..3 (4) in the low 32 bits */
	add		w7, w6, #1
	mov		v0.16b, vctr.16b
	add		w8, w6, #2
	mov		v1.16b, vctr.16b
	add		w9, w6, #3
	mov		v2.16b, vctr.16b
	rev		w7, w7
	mov		v3.16b, vctr.16b
	rev		w8, w8
ST5(	mov		v4.16b, vctr.16b		)
	mov		v1.s[3], w7
	rev		w9, w9
ST5(	add		w10, w6, #4			)
	mov		v2.s[3], w8
ST5(	rev		w10, w10			)
	mov		v3.s[3], w9
ST5(	mov		v4.s[3], w10			)
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	/*
	 * v5/v6 are consumed before they are reloaded with the remaining
	 * input block(s); the order of the next instructions matters.
	 */
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	add		x6, x6, #MAX_STRIDE
	rev		x7, x6
	ins		vctr.d[1], x7
	cbz		w4, .Lctrout
	b		.LctrloopNx
.Lctr1x:
	adds		w4, w4, #MAX_STRIDE	/* undo the subs above */
	beq		.Lctrout
.Lctrloop:
	mov		v0.16b, vctr.16b
	encrypt_block	v0, w3, x2, x8, w7

	adds		x6, x6, #1		/* increment BE ctr */
	rev		x7, x6
	ins		vctr.d[1], x7
	bcs		.Lctrcarry		/* overflow? */

.Lctrcarrydone:
	subs		w4, w4, #1
	bmi		.Lctrtailblock		/* blocks <0 means tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	bne		.Lctrloop		/* flags still from the subs */

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

.Lctrtailblock:
	st1		{v0.16b}, [x0]		/* hand back raw keystream */
	b		.Lctrout

.Lctrcarry:
	umov		x7, vctr.d[0]		/* load upper word of ctr  */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		vctr.d[0], x7
	b		.Lctrcarrydone
AES_FUNC_END(aes_ctr_encrypt)
413*4882a593Smuzhiyun
414*4882a593Smuzhiyun
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
421*4882a593Smuzhiyun
/*
 * next_tweak - derive the next XTS tweak: \out = \in * x in GF(2^128)
 *
 * Each 64-bit lane of \in is doubled; the bit shifted out of one lane is
 * folded into the other lane through the per-lane constants held in
 * "xtsmask" (a register alias defined by the including file, set up by
 * xts_load_mask).  \tmp is clobbered.  \out may be the same register as
 * \in.  Uses vector instructions only, so condition flags are preserved.
 */
	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm
429*4882a593Smuzhiyun
/*
 * xts_load_mask - initialise xtsmask for next_tweak
 *
 * Leaves the 32-bit lanes { 0x1, 0x0, 0x87, 0x0 } in xtsmask, i.e. 0x1 in
 * the low 64-bit lane and 0x87 in the high one.  \tmp is clobbered.
 */
	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
435*4882a593Smuzhiyun
/*
 * aes_xts_encrypt - XTS encryption with ciphertext stealing for a
 * trailing partial block
 *
 * In:	x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes, x5 = rk2,
 *	x6 = iv, w7 = first
 *
 * v4 holds the current tweak; v5-v7 hold the following ones in the
 * 4-block loop and v8 is the scratch register for next_tweak.  When
 * "first" is non-zero the tweak loaded from [x6] is encrypted with rk2
 * before use (unless xts_cts_skip_tw, defined by the including file,
 * branches past that); otherwise it is advanced by one step before use.
 * The tweak in v4 is stored back to [x6] on exit.
 *
 * w4 counts the remaining bytes and goes negative once fewer than a full
 * block (or group of blocks) remains, which selects the ciphertext
 * stealing paths.
 */
AES_FUNC_START(aes_xts_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x			/* < 64 bytes left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64			/* undo the subs above */
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx			/* only a partial block left */
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8			/* leaves the flags alone */
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.LxtsencctsNx:
	/* steal from the last block written by the 4-block loop */
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* finish after one more block */
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)
522*4882a593Smuzhiyun
/*
 * aes_xts_decrypt - XTS decryption with ciphertext stealing for a
 * trailing partial block
 *
 * In:	x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes, x5 = rk2,
 *	x6 = iv, w7 = first
 *
 * v4 holds the current tweak; v5-v7 hold the following ones in the
 * 4-block loop and v8 is the scratch register for next_tweak.  When
 * "first" is non-zero the tweak loaded from [x6] is encrypted with rk2
 * before use (unless xts_cts_skip_tw, defined by the including file,
 * branches past that); otherwise it is advanced by one step before use.
 * The tweak in v4 is stored back to [x6] on exit.
 *
 * If "bytes" is not a multiple of 16, the byte count is reduced by 16 up
 * front so that the last full block is left for the stealing path, which
 * decrypts it with the tweak after the current one (v5) before the final
 * partial block is handled with the current tweak (v4).
 */
AES_FUNC_START(aes_xts_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x			/* < 64 bytes left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64			/* undo the subs above */
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts			/* flags from the last subs */
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8			/* leaves the flags alone */
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* finish after one more block */
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)
616*4882a593Smuzhiyun
	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
/*
 * aes_mac_update - fold input blocks into a CBC-MAC style digest
 *
 * In:	x0 = in, x1 = rk, w2 = rounds, w3 = blocks, x4 = dg,
 *	w5 = enc_before, w6 = enc_after
 * Out:	w0 = number of blocks not yet consumed (w3 at exit)
 *
 * v0 holds the running digest.  It is encrypted once up front when
 * enc_before is non-zero.  Each input block is xor'ed into the digest and
 * the digest is then encrypted - except after the very last block, where
 * the encryption only happens if enc_after is non-zero.  The digest is
 * stored back to [x4] on exit.
 *
 * In the 4-block loop the digest is also stored before cond_yield (a macro
 * defined outside this file), which can branch to .Lmacout and so return
 * early with a non-zero block count.
 */
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x				/* < 4 blocks left */
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next 4 pt blocks */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq		/* last block ? enc_after : ~0 */
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4			/* undo the subs above */
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq		/* last block ? enc_after : ~0 */
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3
	ret
AES_FUNC_END(aes_mac_update)
666