/* xref: /OK3568_Linux_fs/kernel/arch/arm/crypto/aes-ce-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981) */
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-only */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
6*4882a593Smuzhiyun */
7*4882a593Smuzhiyun
8*4882a593Smuzhiyun#include <linux/linkage.h>
9*4882a593Smuzhiyun#include <asm/assembler.h>
10*4882a593Smuzhiyun
	.text
	.arch		armv8-a			@ required for the AES/crypto instructions below
	.fpu		crypto-neon-fp-armv8	@ NEON + Crypto Extensions register/insn set
	.align		3
15*4882a593Smuzhiyun
	@ One full AES encryption round on \state using round key \key:
	@ aese = AddRoundKey + SubBytes + ShiftRows, aesmc = MixColumns.
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm
20*4882a593Smuzhiyun
	@ One full AES decryption round on \state using round key \key:
	@ aesd = AddRoundKey + InvSubBytes + InvShiftRows, aesimc = InvMixColumns.
	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm
25*4882a593Smuzhiyun
	@ Two consecutive encryption rounds on the single block in q0.
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm
30*4882a593Smuzhiyun
	@ Two consecutive decryption rounds on the single block in q0.
	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm
35*4882a593Smuzhiyun
	@ Final two encryption rounds on q0: one full round with \key1, then the
	@ last round with \key2 (no MixColumns) followed by XOR with the final
	@ round key \key3.
	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm
41*4882a593Smuzhiyun
	@ Final two decryption rounds on q0: one full round with \key1, then the
	@ last round with \key2 (no InvMixColumns) followed by XOR with the
	@ final round key \key3.
	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm
47*4882a593Smuzhiyun
	@ Two encryption rounds interleaved across the four blocks in q0-q3,
	@ keeping four independent dependency chains in flight.
	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm
58*4882a593Smuzhiyun
	@ Two decryption rounds interleaved across the four blocks in q0-q3.
	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm
69*4882a593Smuzhiyun
	@ Final two encryption rounds interleaved across q0-q3: one full round
	@ with \key1, last round with \key2 (no MixColumns), then XOR with the
	@ final round key \key3.
	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm
84*4882a593Smuzhiyun
	@ Final two decryption rounds interleaved across q0-q3: one full round
	@ with \key1, last round with \key2 (no InvMixColumns), then XOR with
	@ the final round key \key3.
	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm
99*4882a593Smuzhiyun
	/*
	 * Run all AES rounds on the state block(s). On entry, q8/q9 hold round
	 * keys 1-2 and q14 the final round key (see prepare_key), ip points at
	 * the 3rd round key, and r3 holds the round count (10/12/14). Round
	 * keys are streamed into q10-q13 in alternation, one load ahead of the
	 * \dround that consumes them. The cmp below sets the flags consumed by
	 * the blo/beq branches that cut the sequence short for AES-128/192.
	 */
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm
122*4882a593Smuzhiyun
	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2 and ip
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_4x version only)
	 *   q2        : third in/output block (_4x version only)
	 *   q3        : fourth in/output block (_4x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:			@ entered by ce_aes_xts_init with ip preset
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)
143*4882a593Smuzhiyun
	@ Single-block AES decryption; same register contract as aes_encrypt.
	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)
149*4882a593Smuzhiyun
	@ Four-block interleaved AES encryption of q0-q3; same contract as
	@ aes_encrypt otherwise.
	.align		6
aes_encrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)
155*4882a593Smuzhiyun
	@ Four-block interleaved AES decryption of q0-q3; same contract as
	@ aes_encrypt otherwise.
	.align		6
aes_decrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)
161*4882a593Smuzhiyun
	@ Load the round keys the aes_* helpers expect: q8/q9 = first two round
	@ keys, q14 = last round key (at \rk + \rounds * 16). Clobbers ip.
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm
167*4882a593Smuzhiyun
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg, on stack)
	prepare_key	r2, r3
.Lecbencloop4x:
	subs		r4, r4, #4
	bmi		.Lecbenc1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4		@ undo over-subtraction
	beq		.Lecbencout
.Lecbencloop:					@ remaining 1-3 blocks, one at a time
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)
199*4882a593Smuzhiyun
ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg, on stack)
	prepare_key	r2, r3
.Lecbdecloop4x:
	subs		r4, r4, #4
	bmi		.Lecbdec1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4		@ undo over-subtraction
	beq		.Lecbdecout
.Lecbdecloop:					@ remaining 1-3 blocks, one at a time
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)
225*4882a593Smuzhiyun
	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 *
	 * CBC encryption is inherently serial (each block chains on the
	 * previous ciphertext), so no 4x interleaved path exists here.
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv pointer
	vld1.8		{q0}, [r5]		@ q0 = IV / previous ciphertext
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ write back next IV
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
247*4882a593Smuzhiyun
ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv pointer
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:
	subs		r4, r4, #4
	bmi		.Lcbcdec1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0			@ stash ciphertext: needed for the
	vmov		q5, q1			@ CBC xor after decryption
	vmov		q6, q2
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15		@ first block xors with prev IV
	veor		q1, q1, q4		@ others xor with preceding ct
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7			@ last ct becomes next IV
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4		@ undo over-subtraction
	beq		.Lcbcdecout
	vmov		q6, q14			@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ keep iv in q15
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
287*4882a593Smuzhiyun
288*4882a593Smuzhiyun
	/*
	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 *
	 * Handles the final two blocks (16 < bytes <= 32) of a CBC message
	 * using ciphertext stealing; earlier blocks go through the plain CBC
	 * routines above. q5/q6 are vtbl permutation vectors derived from
	 * .Lcts_permute_table at offsets depending on the partial length.
	 */
ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = bytes, r5 = iv pointer

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16		@ r4 = bytes in final partial block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]		@ permute vector for ct stealing
	vld1.8		{q6}, [lr]		@ permute vector for partial block
	add		ip, r1, r4
	vld1.8		{q0}, [r1]			@ overlapping loads
	vld1.8		{q3}, [ip]

	vld1.8		{q1}, [r5]			@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1			@ xor with iv
	bl		aes_encrypt

	vtbl.8		d4, {d0-d1}, d10	@ q2 = stolen tail of first ct
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12	@ q1 = padded final partial block
	vtbl.8		d3, {d6-d7}, d13

	veor		q0, q0, q1		@ CBC-chain into final block
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]			@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)
333*4882a593Smuzhiyun
ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = bytes, r5 = iv pointer

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16		@ r4 = bytes in final partial block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]		@ permute vector for ct stealing
	vld1.8		{q6}, [lr]		@ permute vector for partial block
	add		ip, r1, r4
	vld1.8		{q0}, [r1]			@ overlapping loads
	vld1.8		{q1}, [ip]

	vld1.8		{q3}, [r5]			@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt		@ decrypt second-to-last ct block

	vtbl.8		d4, {d0-d1}, d10	@ q2 = stolen ct bytes
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12	@ splice final partial ct into q0
	vtbx.8		d1, {d2-d3}, d13

	veor		q1, q1, q2
	bl		aes_decrypt
	veor		q0, q0, q3			@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]			@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)
371*4882a593Smuzhiyun
372*4882a593Smuzhiyun
	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 *
	 * The 128-bit big-endian counter lives in q7; its low 32 bits are
	 * mirrored (byte-swapped) in r6 so it can be incremented with integer
	 * arithmetic. The 4x path is only used when those 32 bits cannot
	 * overflow within the request (checked up front).
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr pointer
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop		@ would wrap: use slow carry path
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x

	/*
	 * NOTE: the sequence below has been carefully tweaked to avoid
	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
	 * may produce an incorrect result if they take their input from a
	 * register of which a single 32-bit lane has been updated the last
	 * time it was modified. To work around this, the lanes of registers
	 * q0-q3 below are not manipulated individually, and the different
	 * counter values are prepared by successive manipulations of q7.
	 */
	add		ip, r6, #1
	vmov		q0, q7
	rev		ip, ip
	add		lr, r6, #2
	vmov		s31, ip			@ set lane 3 of q1 via q7
	add		ip, r6, #3
	rev		lr, lr
	vmov		q1, q7
	vmov		s31, lr			@ set lane 3 of q2 via q7
	rev		ip, ip
	vmov		q2, q7
	vmov		s31, ip			@ set lane 3 of q3 via q7
	add		r6, r6, #4
	vmov		q3, q7

	vld1.8		{q4-q5}, [r1]!		@ load 4 blocks of input
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!
	bl		aes_encrypt_4x
	veor		q0, q0, q4		@ xor keystream with input
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip			@ update low ctr word in q7
	b		.Lctrloop4x
.Lctr1x:
	adds		r4, r4, #4		@ undo over-subtraction
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip
	bcs		.Lctrcarry		@ low word wrapped: propagate carry

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:
	@ Ripple the carry through the upper three BE words of the counter
	@ (s30 = word 2, s29 = word 1, s28 = word 0).
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)
467*4882a593Smuzhiyun
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
	 */

	@ Compute the next XTS tweak: multiply \in by x in GF(2^128) with the
	@ reduction polynomial selected by \const (0x87 mask, see
	@ ce_aes_xts_init). \tmp is clobbered.
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63		@ replicate top bit of each half
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in		@ shift left by one
	vext.8		\tmp, \tmp, \tmp, #8	@ swap halves: carry crosses 64-bit lanes
	veor		\out, \out, \tmp
	.endm
482*4882a593Smuzhiyun
	@ Shared XTS prologue (internal, called via bl from the two entry
	@ points below): builds the tweak mask in q15/d30, loads the IV into
	@ q0 and, at the start of a block ('first' arg == 1), encrypts the IV
	@ with AES key 2 to produce the initial tweak.
ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7

	ldrd		r4, r5, [sp, #16]	@ load args: r4 = bytes, r5 = iv
	ldr		r6, [sp, #28]		@ r6 = first
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
501*4882a593Smuzhiyun
ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0			@ q4 = current tweak

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x		@ continuing: don't re-advance tweak

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x		@ fewer than 64 bytes left
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10	@ tweaks for blocks 2-4 in q5-q7
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4		@ post-whitening with same tweaks
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64		@ undo over-subtraction
	beq		.Lxtsencout
	subs		r4, r4, #16
	bmi		.LxtsencctsNx		@ partial tail straight after 4x path
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts		@ negative: ciphertext stealing needed
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]		@ return next tweak via iv buffer
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3			@ reuse last full ct block for CTS
	sub		r0, r0, #16
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]		@ vtbl index vectors for stealing
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4		@ q2 = stolen ct tail
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6		@ splice partial pt into q0
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsencctsout
ENDPROC(ce_aes_xts_encrypt)
588*4882a593Smuzhiyun
589*4882a593Smuzhiyun
ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0			@ q4 = current tweak

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf
	subne		r4, r4, #0x10

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x		@ continuing: don't re-advance tweak

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x		@ fewer than 64 bytes left
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10	@ tweaks for blocks 2-4 in q5-q7
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4		@ post-whitening with same tweaks
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64		@ undo over-subtraction
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts		@ negative: ciphertext stealing needed
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]		@ return next tweak via iv buffer
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6		@ penultimate block uses NEXT tweak

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]		@ vtbl index vectors for stealing
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5
	bl		aes_decrypt
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4		@ q2 = stolen pt tail
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6		@ splice partial ct into q0
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsdecctsout
ENDPROC(ce_aes_xts_decrypt)
680*4882a593Smuzhiyun
	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
	 *
	 * aese against an all-zero state reduces to SubBytes(ShiftRows(input));
	 * broadcasting the word into all lanes makes ShiftRows a no-op for the
	 * lane read back out.
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0		@ zero state: aese key-add is a no-op
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)
693*4882a593Smuzhiyun
	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
	 *                                        and store the result at *dst
	 */
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.32		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)
704*4882a593Smuzhiyun
	@ Sliding-window table for the ciphertext-stealing vtbl/vtbx permute
	@ vectors: loading 16 bytes at a variable offset yields an index vector
	@ with 0..15 entries selecting live bytes and 0xff entries producing
	@ zero (vtbl) or leaving the destination untouched (vtbx).
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
714