xref: /OK3568_Linux_fs/kernel/arch/arm/crypto/aes-neonbs-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-only */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Bit sliced AES using NEON instructions
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2017 Linaro Ltd.
6*4882a593Smuzhiyun * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
7*4882a593Smuzhiyun */
8*4882a593Smuzhiyun
9*4882a593Smuzhiyun/*
10*4882a593Smuzhiyun * The algorithm implemented here is described in detail by the paper
11*4882a593Smuzhiyun * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
12*4882a593Smuzhiyun * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
13*4882a593Smuzhiyun *
14*4882a593Smuzhiyun * This implementation is based primarily on the OpenSSL implementation
15*4882a593Smuzhiyun * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
16*4882a593Smuzhiyun */
17*4882a593Smuzhiyun
18*4882a593Smuzhiyun#include <linux/linkage.h>
19*4882a593Smuzhiyun#include <asm/assembler.h>
20*4882a593Smuzhiyun
21*4882a593Smuzhiyun	.text
22*4882a593Smuzhiyun	.fpu		neon
23*4882a593Smuzhiyun
// Register roles used by the whole file: 'rounds' (ip) holds the AES
// round count, 'bskey' (r4) walks the bit-sliced key schedule.  Both
// are loaded by the C-facing entry points before the encrypt8/decrypt8
// cores are called.
24*4882a593Smuzhiyun	rounds		.req	ip
25*4882a593Smuzhiyun	bskey		.req	r4
26*4882a593Smuzhiyun
// qNl/qNh alias the low/high d-register halves of each q register so
// that macros (__tbl, __ldr, next_ctr) can apply d-register-only
// instructions such as vtbl/vldr/vmov.32 to a whole q register.
27*4882a593Smuzhiyun	q0l		.req	d0
28*4882a593Smuzhiyun	q0h		.req	d1
29*4882a593Smuzhiyun	q1l		.req	d2
30*4882a593Smuzhiyun	q1h		.req	d3
31*4882a593Smuzhiyun	q2l		.req	d4
32*4882a593Smuzhiyun	q2h		.req	d5
33*4882a593Smuzhiyun	q3l		.req	d6
34*4882a593Smuzhiyun	q3h		.req	d7
35*4882a593Smuzhiyun	q4l		.req	d8
36*4882a593Smuzhiyun	q4h		.req	d9
37*4882a593Smuzhiyun	q5l		.req	d10
38*4882a593Smuzhiyun	q5h		.req	d11
39*4882a593Smuzhiyun	q6l		.req	d12
40*4882a593Smuzhiyun	q6h		.req	d13
41*4882a593Smuzhiyun	q7l		.req	d14
42*4882a593Smuzhiyun	q7h		.req	d15
43*4882a593Smuzhiyun	q8l		.req	d16
44*4882a593Smuzhiyun	q8h		.req	d17
45*4882a593Smuzhiyun	q9l		.req	d18
46*4882a593Smuzhiyun	q9h		.req	d19
47*4882a593Smuzhiyun	q10l		.req	d20
48*4882a593Smuzhiyun	q10h		.req	d21
49*4882a593Smuzhiyun	q11l		.req	d22
50*4882a593Smuzhiyun	q11h		.req	d23
51*4882a593Smuzhiyun	q12l		.req	d24
52*4882a593Smuzhiyun	q12h		.req	d25
53*4882a593Smuzhiyun	q13l		.req	d26
54*4882a593Smuzhiyun	q13h		.req	d27
55*4882a593Smuzhiyun	q14l		.req	d28
56*4882a593Smuzhiyun	q14h		.req	d29
57*4882a593Smuzhiyun	q15l		.req	d30
58*4882a593Smuzhiyun	q15h		.req	d31
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun	.macro		__tbl, out, tbl, in, tmp
// Quadword byte-permute: \out = vtbl(\tbl, \in), done as two d-register
// lookups via the qNl/qNh aliases (vtbl has no q-register form).
// If \out and \tbl are the same q register, \tbl is copied to \tmp
// first so the high-half lookup still sees the original table after
// the low half has been overwritten.
61*4882a593Smuzhiyun	.ifc		\out, \tbl
62*4882a593Smuzhiyun	.ifb		\tmp
63*4882a593Smuzhiyun	.error		__tbl needs temp register if out == tbl
64*4882a593Smuzhiyun	.endif
65*4882a593Smuzhiyun	vmov		\tmp, \out
66*4882a593Smuzhiyun	.endif
67*4882a593Smuzhiyun	vtbl.8		\out\()l, {\tbl}, \in\()l
68*4882a593Smuzhiyun	.ifc		\out, \tbl
69*4882a593Smuzhiyun	vtbl.8		\out\()h, {\tmp}, \in\()h
70*4882a593Smuzhiyun	.else
71*4882a593Smuzhiyun	vtbl.8		\out\()h, {\tbl}, \in\()h
72*4882a593Smuzhiyun	.endif
73*4882a593Smuzhiyun	.endm
74*4882a593Smuzhiyun
75*4882a593Smuzhiyun	.macro		__ldr, out, sym
// Load a 16-byte literal at \sym into q register \out as two vldr's
// (vldr only takes d registers; \sym must be within vldr's PC-relative
// offset range of the use site).
76*4882a593Smuzhiyun	vldr		\out\()l, \sym
77*4882a593Smuzhiyun	vldr		\out\()h, \sym + 8
78*4882a593Smuzhiyun	.endm
79*4882a593Smuzhiyun
80*4882a593Smuzhiyun	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
// Input basis change for the bit-sliced sbox: a fixed linear (XOR-only)
// transform of the eight bitsliced state rows, taken from the bsaes
// design this file derives from (see file header).  The XOR sequence
// is order-sensitive — later lines consume results of earlier ones.
81*4882a593Smuzhiyun	veor		\b2, \b2, \b1
82*4882a593Smuzhiyun	veor		\b5, \b5, \b6
83*4882a593Smuzhiyun	veor		\b3, \b3, \b0
84*4882a593Smuzhiyun	veor		\b6, \b6, \b2
85*4882a593Smuzhiyun	veor		\b5, \b5, \b0
86*4882a593Smuzhiyun	veor		\b6, \b6, \b3
87*4882a593Smuzhiyun	veor		\b3, \b3, \b7
88*4882a593Smuzhiyun	veor		\b7, \b7, \b5
89*4882a593Smuzhiyun	veor		\b3, \b3, \b4
90*4882a593Smuzhiyun	veor		\b4, \b4, \b5
91*4882a593Smuzhiyun	veor		\b2, \b2, \b7
92*4882a593Smuzhiyun	veor		\b3, \b3, \b1
93*4882a593Smuzhiyun	veor		\b1, \b1, \b5
94*4882a593Smuzhiyun	.endm
95*4882a593Smuzhiyun
96*4882a593Smuzhiyun	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
// Output basis change of the bit-sliced sbox: linear XOR transform
// applied after the GF(2^8) inversion (the companion of in_bs_ch).
// Callers pass a permutation of the state registers; that permutation
// is part of the linear map.  Order-sensitive.
97*4882a593Smuzhiyun	veor		\b0, \b0, \b6
98*4882a593Smuzhiyun	veor		\b1, \b1, \b4
99*4882a593Smuzhiyun	veor		\b4, \b4, \b6
100*4882a593Smuzhiyun	veor		\b2, \b2, \b0
101*4882a593Smuzhiyun	veor		\b6, \b6, \b1
102*4882a593Smuzhiyun	veor		\b1, \b1, \b5
103*4882a593Smuzhiyun	veor		\b5, \b5, \b3
104*4882a593Smuzhiyun	veor		\b3, \b3, \b7
105*4882a593Smuzhiyun	veor		\b7, \b7, \b5
106*4882a593Smuzhiyun	veor		\b2, \b2, \b5
107*4882a593Smuzhiyun	veor		\b4, \b4, \b7
108*4882a593Smuzhiyun	.endm
109*4882a593Smuzhiyun
110*4882a593Smuzhiyun	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
// Input basis change for the inverse sbox (decryption path).  Note the
// parameter list names are deliberately permuted relative to positional
// order — the permutation encodes part of the inverse linear map.
111*4882a593Smuzhiyun	veor		\b1, \b1, \b7
112*4882a593Smuzhiyun	veor		\b4, \b4, \b7
113*4882a593Smuzhiyun	veor		\b7, \b7, \b5
114*4882a593Smuzhiyun	veor		\b1, \b1, \b3
115*4882a593Smuzhiyun	veor		\b2, \b2, \b5
116*4882a593Smuzhiyun	veor		\b3, \b3, \b7
117*4882a593Smuzhiyun	veor		\b6, \b6, \b1
118*4882a593Smuzhiyun	veor		\b2, \b2, \b0
119*4882a593Smuzhiyun	veor		\b5, \b5, \b3
120*4882a593Smuzhiyun	veor		\b4, \b4, \b6
121*4882a593Smuzhiyun	veor		\b0, \b0, \b6
122*4882a593Smuzhiyun	veor		\b1, \b1, \b4
123*4882a593Smuzhiyun	.endm
124*4882a593Smuzhiyun
125*4882a593Smuzhiyun	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
// Output basis change for the inverse sbox (decryption path); the
// permuted parameter names encode part of the linear map, as in
// inv_in_bs_ch.  XOR sequence is order-sensitive.
126*4882a593Smuzhiyun	veor		\b1, \b1, \b5
127*4882a593Smuzhiyun	veor		\b2, \b2, \b7
128*4882a593Smuzhiyun	veor		\b3, \b3, \b1
129*4882a593Smuzhiyun	veor		\b4, \b4, \b5
130*4882a593Smuzhiyun	veor		\b7, \b7, \b5
131*4882a593Smuzhiyun	veor		\b3, \b3, \b4
132*4882a593Smuzhiyun	veor 		\b5, \b5, \b0
133*4882a593Smuzhiyun	veor		\b3, \b3, \b7
134*4882a593Smuzhiyun	veor		\b6, \b6, \b2
135*4882a593Smuzhiyun	veor		\b2, \b2, \b1
136*4882a593Smuzhiyun	veor		\b6, \b6, \b3
137*4882a593Smuzhiyun	veor		\b3, \b3, \b0
138*4882a593Smuzhiyun	veor		\b5, \b5, \b6
139*4882a593Smuzhiyun	.endm
140*4882a593Smuzhiyun
141*4882a593Smuzhiyun	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
// Bit-sliced GF(2^4) multiply: (\x1:\x0) *= (\y1:\y0), where each
// element is represented across two q registers.  Uses a three-AND
// Karatsuba-style schoolbook; \y0/\y1 are read-only, \t0/\t1 are
// clobbered as scratch.
142*4882a593Smuzhiyun	veor 		\t0, \y0, \y1
143*4882a593Smuzhiyun	vand		\t0, \t0, \x0
144*4882a593Smuzhiyun	veor		\x0, \x0, \x1
145*4882a593Smuzhiyun	vand		\t1, \x1, \y0
146*4882a593Smuzhiyun	vand		\x0, \x0, \y1
147*4882a593Smuzhiyun	veor		\x1, \t1, \t0
148*4882a593Smuzhiyun	veor		\x0, \x0, \t1
149*4882a593Smuzhiyun	.endm
150*4882a593Smuzhiyun
151*4882a593Smuzhiyun	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
// Two bit-sliced GF(2^4) multiplies interleaved for pipelining:
// (\x1:\x0) *= (\y1:\y0) and (\x3:\x2) *= (\y3:\y2).  Note the final
// correction terms are applied asymmetrically (\x0 gets \t0, \x3 gets
// \t1, while \x1/\x2 fold in their partner limb) — this matches the
// bsaes original and is relied upon by mul_gf16_2.  \t0/\t1 clobbered.
152*4882a593Smuzhiyun	veor		\t0, \y0, \y1
153*4882a593Smuzhiyun	veor 		\t1, \y2, \y3
154*4882a593Smuzhiyun	vand		\t0, \t0, \x0
155*4882a593Smuzhiyun	vand		\t1, \t1, \x2
156*4882a593Smuzhiyun	veor		\x0, \x0, \x1
157*4882a593Smuzhiyun	veor		\x2, \x2, \x3
158*4882a593Smuzhiyun	vand		\x1, \x1, \y0
159*4882a593Smuzhiyun	vand		\x3, \x3, \y2
160*4882a593Smuzhiyun	vand		\x0, \x0, \y1
161*4882a593Smuzhiyun	vand		\x2, \x2, \y3
162*4882a593Smuzhiyun	veor		\x1, \x1, \x0
163*4882a593Smuzhiyun	veor		\x2, \x2, \x3
164*4882a593Smuzhiyun	veor		\x0, \x0, \t0
165*4882a593Smuzhiyun	veor		\x3, \x3, \t1
166*4882a593Smuzhiyun	.endm
167*4882a593Smuzhiyun
168*4882a593Smuzhiyun	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
169*4882a593Smuzhiyun				    y0, y1, y2, y3, t0, t1, t2, t3
// Multiply the eight bitsliced rows \x0..\x7, viewed as two tower-field
// elements (x0..x3 and x4..x7), by the common factor \y0..\y3, using
// Karatsuba over the GF(2^4) multiplies above.  NOTE(review): \y0/\y1
// are updated in place (XORed with \y2/\y3) twice, so on exit the y
// registers do not hold their original values — callers treat them as
// clobbered.  \t0..\t3 are scratch.
170*4882a593Smuzhiyun	veor		\t0, \x0, \x2
171*4882a593Smuzhiyun	veor		\t1, \x1, \x3
172*4882a593Smuzhiyun	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
173*4882a593Smuzhiyun	veor		\y0, \y0, \y2
174*4882a593Smuzhiyun	veor		\y1, \y1, \y3
175*4882a593Smuzhiyun	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
176*4882a593Smuzhiyun	veor		\x0, \x0, \t0
177*4882a593Smuzhiyun	veor		\x2, \x2, \t0
178*4882a593Smuzhiyun	veor		\x1, \x1, \t1
179*4882a593Smuzhiyun	veor		\x3, \x3, \t1
180*4882a593Smuzhiyun	veor		\t0, \x4, \x6
181*4882a593Smuzhiyun	veor		\t1, \x5, \x7
182*4882a593Smuzhiyun	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
183*4882a593Smuzhiyun	veor		\y0, \y0, \y2
184*4882a593Smuzhiyun	veor		\y1, \y1, \y3
185*4882a593Smuzhiyun	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
186*4882a593Smuzhiyun	veor		\x4, \x4, \t0
187*4882a593Smuzhiyun	veor		\x6, \x6, \t0
188*4882a593Smuzhiyun	veor		\x5, \x5, \t1
189*4882a593Smuzhiyun	veor		\x7, \x7, \t1
190*4882a593Smuzhiyun	.endm
191*4882a593Smuzhiyun
192*4882a593Smuzhiyun	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
193*4882a593Smuzhiyun				   t0, t1, t2, t3, s0, s1, s2, s3
// Bit-sliced GF(2^8) inversion over the eight state rows \x0..\x7 —
// the non-linear core of the AES sbox, computed branch-free and
// table-free (constant time) via the tower-field decomposition of the
// Kaesper/Schwabe paper cited in the file header.  \t0..\t3 and
// \s0..\s3 are scratch; the final mul_gf16_2 multiplies the state by
// the computed inverse factor.  Every instruction here is
// order-sensitive.
194*4882a593Smuzhiyun	veor		\t3, \x4, \x6
195*4882a593Smuzhiyun	veor		\t0, \x5, \x7
196*4882a593Smuzhiyun	veor		\t1, \x1, \x3
197*4882a593Smuzhiyun	veor		\s1, \x7, \x6
198*4882a593Smuzhiyun	veor		\s0, \x0, \x2
199*4882a593Smuzhiyun	veor		\s3, \t3, \t0
200*4882a593Smuzhiyun	vorr		\t2, \t0, \t1
201*4882a593Smuzhiyun	vand		\s2, \t3, \s0
202*4882a593Smuzhiyun	vorr		\t3, \t3, \s0
203*4882a593Smuzhiyun	veor		\s0, \s0, \t1
204*4882a593Smuzhiyun	vand		\t0, \t0, \t1
205*4882a593Smuzhiyun	veor		\t1, \x3, \x2
206*4882a593Smuzhiyun	vand		\s3, \s3, \s0
207*4882a593Smuzhiyun	vand		\s1, \s1, \t1
208*4882a593Smuzhiyun	veor		\t1, \x4, \x5
209*4882a593Smuzhiyun	veor		\s0, \x1, \x0
210*4882a593Smuzhiyun	veor		\t3, \t3, \s1
211*4882a593Smuzhiyun	veor		\t2, \t2, \s1
212*4882a593Smuzhiyun	vand		\s1, \t1, \s0
213*4882a593Smuzhiyun	vorr		\t1, \t1, \s0
214*4882a593Smuzhiyun	veor		\t3, \t3, \s3
215*4882a593Smuzhiyun	veor		\t0, \t0, \s1
216*4882a593Smuzhiyun	veor		\t2, \t2, \s2
217*4882a593Smuzhiyun	veor		\t1, \t1, \s3
218*4882a593Smuzhiyun	veor		\t0, \t0, \s2
219*4882a593Smuzhiyun	vand		\s0, \x7, \x3
220*4882a593Smuzhiyun	veor		\t1, \t1, \s2
221*4882a593Smuzhiyun	vand		\s1, \x6, \x2
222*4882a593Smuzhiyun	vand		\s2, \x5, \x1
223*4882a593Smuzhiyun	vorr		\s3, \x4, \x0
224*4882a593Smuzhiyun	veor		\t3, \t3, \s0
225*4882a593Smuzhiyun	veor		\t1, \t1, \s2
226*4882a593Smuzhiyun	veor		\s0, \t0, \s3
227*4882a593Smuzhiyun	veor		\t2, \t2, \s1
// GF(2^4) inversion of the intermediate values; vbsl (bitwise select)
// computes muxes without branches, keeping the code constant time.
228*4882a593Smuzhiyun	vand		\s2, \t3, \t1
229*4882a593Smuzhiyun	veor		\s1, \t2, \s2
230*4882a593Smuzhiyun	veor		\s3, \s0, \s2
231*4882a593Smuzhiyun	vbsl		\s1, \t1, \s0
232*4882a593Smuzhiyun	vmvn		\t0, \s0
233*4882a593Smuzhiyun	vbsl		\s0, \s1, \s3
234*4882a593Smuzhiyun	vbsl		\t0, \s1, \s3
235*4882a593Smuzhiyun	vbsl		\s3, \t3, \t2
236*4882a593Smuzhiyun	veor		\t3, \t3, \t2
237*4882a593Smuzhiyun	vand		\s2, \s0, \s3
238*4882a593Smuzhiyun	veor		\t1, \t1, \t0
239*4882a593Smuzhiyun	veor		\s2, \s2, \t3
240*4882a593Smuzhiyun	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
241*4882a593Smuzhiyun			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
242*4882a593Smuzhiyun	.endm
243*4882a593Smuzhiyun
244*4882a593Smuzhiyun	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
245*4882a593Smuzhiyun			      t0, t1, t2, t3, s0, s1, s2, s3
// Full bit-sliced AES SubBytes over eight blocks at once: input basis
// change, GF(2^8) inversion, output basis change.  The register
// permutations passed to inv_gf256/out_bs_ch implement the affine
// parts of the sbox; after this macro the logical row order of the
// state is permuted (callers account for it, see .Lenc_done).
246*4882a593Smuzhiyun	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
247*4882a593Smuzhiyun	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
248*4882a593Smuzhiyun			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
249*4882a593Smuzhiyun	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
250*4882a593Smuzhiyun	.endm
251*4882a593Smuzhiyun
252*4882a593Smuzhiyun	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
253*4882a593Smuzhiyun				  t0, t1, t2, t3, s0, s1, s2, s3
// Bit-sliced inverse AES SubBytes (decryption): inverse input basis
// change, shared GF(2^8) inversion, inverse output basis change, with
// register permutations encoding the inverse affine maps.
254*4882a593Smuzhiyun	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
255*4882a593Smuzhiyun	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
256*4882a593Smuzhiyun			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
257*4882a593Smuzhiyun	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
258*4882a593Smuzhiyun	.endm
259*4882a593Smuzhiyun
260*4882a593Smuzhiyun	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
261*4882a593Smuzhiyun				    t0, t1, t2, t3, mask
// Combined AddRoundKey + ShiftRows: loads 8 bit-sliced round key
// registers (128 bytes, advancing bskey) and XORs them in, then
// applies the ShiftRows byte permutation held in \mask via __tbl.
// Key loads are interleaved with the XOR/permute work to hide load
// latency.
262*4882a593Smuzhiyun	vld1.8		{\t0-\t1}, [bskey, :256]!
263*4882a593Smuzhiyun	veor		\t0, \t0, \x0
264*4882a593Smuzhiyun	vld1.8		{\t2-\t3}, [bskey, :256]!
265*4882a593Smuzhiyun	veor		\t1, \t1, \x1
266*4882a593Smuzhiyun	__tbl		\x0, \t0, \mask
267*4882a593Smuzhiyun	veor		\t2, \t2, \x2
268*4882a593Smuzhiyun	__tbl		\x1, \t1, \mask
269*4882a593Smuzhiyun	vld1.8		{\t0-\t1}, [bskey, :256]!
270*4882a593Smuzhiyun	veor		\t3, \t3, \x3
271*4882a593Smuzhiyun	__tbl		\x2, \t2, \mask
272*4882a593Smuzhiyun	__tbl		\x3, \t3, \mask
273*4882a593Smuzhiyun	vld1.8		{\t2-\t3}, [bskey, :256]!
274*4882a593Smuzhiyun	veor		\t0, \t0, \x4
275*4882a593Smuzhiyun	veor		\t1, \t1, \x5
276*4882a593Smuzhiyun	__tbl		\x4, \t0, \mask
277*4882a593Smuzhiyun	veor		\t2, \t2, \x6
278*4882a593Smuzhiyun	__tbl		\x5, \t1, \mask
279*4882a593Smuzhiyun	veor		\t3, \t3, \x7
280*4882a593Smuzhiyun	__tbl		\x6, \t2, \mask
281*4882a593Smuzhiyun	__tbl		\x7, \t3, \mask
282*4882a593Smuzhiyun	.endm
283*4882a593Smuzhiyun
284*4882a593Smuzhiyun	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
285*4882a593Smuzhiyun					t0, t1, t2, t3, mask
// Inverse ShiftRows: in-place byte permutation of each state register
// by \mask.  Because out == tbl, each __tbl needs a temp (\t0..\t3,
// cycled).  Unlike shift_rows, no round key is applied here — the
// decrypt path adds the key in inv_mix_cols instead.
286*4882a593Smuzhiyun	__tbl		\x0, \x0, \mask, \t0
287*4882a593Smuzhiyun	__tbl		\x1, \x1, \mask, \t1
288*4882a593Smuzhiyun	__tbl		\x2, \x2, \mask, \t2
289*4882a593Smuzhiyun	__tbl		\x3, \x3, \mask, \t3
290*4882a593Smuzhiyun	__tbl		\x4, \x4, \mask, \t0
291*4882a593Smuzhiyun	__tbl		\x5, \x5, \mask, \t1
292*4882a593Smuzhiyun	__tbl		\x6, \x6, \mask, \t2
293*4882a593Smuzhiyun	__tbl		\x7, \x7, \mask, \t3
294*4882a593Smuzhiyun	.endm
295*4882a593Smuzhiyun
296*4882a593Smuzhiyun	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
297*4882a593Smuzhiyun				  t0, t1, t2, t3, t4, t5, t6, t7, inv
// Bit-sliced MixColumns.  In this representation a rotation of the AES
// columns becomes a byte rotation of each 16-byte register, done with
// vext #12 (rotate by 4 bytes) and vext #8 (rotate by 8 bytes); the
// GF(2) mixing itself is XORs.  If \inv is non-blank (set by
// inv_mix_cols) the tail writes results with the register assignment
// the decrypt path expects; otherwise the encrypt assignment is used.
298*4882a593Smuzhiyun	vext.8		\t0, \x0, \x0, #12
299*4882a593Smuzhiyun	vext.8		\t1, \x1, \x1, #12
300*4882a593Smuzhiyun	veor		\x0, \x0, \t0
301*4882a593Smuzhiyun	vext.8		\t2, \x2, \x2, #12
302*4882a593Smuzhiyun	veor		\x1, \x1, \t1
303*4882a593Smuzhiyun	vext.8		\t3, \x3, \x3, #12
304*4882a593Smuzhiyun	veor		\x2, \x2, \t2
305*4882a593Smuzhiyun	vext.8		\t4, \x4, \x4, #12
306*4882a593Smuzhiyun	veor		\x3, \x3, \t3
307*4882a593Smuzhiyun	vext.8		\t5, \x5, \x5, #12
308*4882a593Smuzhiyun	veor		\x4, \x4, \t4
309*4882a593Smuzhiyun	vext.8		\t6, \x6, \x6, #12
310*4882a593Smuzhiyun	veor		\x5, \x5, \t5
311*4882a593Smuzhiyun	vext.8		\t7, \x7, \x7, #12
312*4882a593Smuzhiyun	veor		\x6, \x6, \t6
313*4882a593Smuzhiyun	veor		\t1, \t1, \x0
// NOTE(review): the '.8' datatype suffix below is redundant for a
// bitwise veor (assembles identically to plain veor) and is
// inconsistent with the rest of the file; harmless.
314*4882a593Smuzhiyun	veor.8		\x7, \x7, \t7
315*4882a593Smuzhiyun	vext.8		\x0, \x0, \x0, #8
316*4882a593Smuzhiyun	veor		\t2, \t2, \x1
317*4882a593Smuzhiyun	veor		\t0, \t0, \x7
318*4882a593Smuzhiyun	veor		\t1, \t1, \x7
319*4882a593Smuzhiyun	vext.8		\x1, \x1, \x1, #8
320*4882a593Smuzhiyun	veor		\t5, \t5, \x4
321*4882a593Smuzhiyun	veor		\x0, \x0, \t0
322*4882a593Smuzhiyun	veor		\t6, \t6, \x5
323*4882a593Smuzhiyun	veor		\x1, \x1, \t1
324*4882a593Smuzhiyun	vext.8		\t0, \x4, \x4, #8
325*4882a593Smuzhiyun	veor		\t4, \t4, \x3
326*4882a593Smuzhiyun	vext.8		\t1, \x5, \x5, #8
327*4882a593Smuzhiyun	veor		\t7, \t7, \x6
328*4882a593Smuzhiyun	vext.8		\x4, \x3, \x3, #8
329*4882a593Smuzhiyun	veor		\t3, \t3, \x2
330*4882a593Smuzhiyun	vext.8		\x5, \x7, \x7, #8
331*4882a593Smuzhiyun	veor		\t4, \t4, \x7
332*4882a593Smuzhiyun	vext.8		\x3, \x6, \x6, #8
333*4882a593Smuzhiyun	veor		\t3, \t3, \x7
334*4882a593Smuzhiyun	vext.8		\x6, \x2, \x2, #8
335*4882a593Smuzhiyun	veor		\x7, \t1, \t5
336*4882a593Smuzhiyun	.ifb		\inv
337*4882a593Smuzhiyun	veor		\x2, \t0, \t4
338*4882a593Smuzhiyun	veor		\x4, \x4, \t3
339*4882a593Smuzhiyun	veor		\x5, \x5, \t7
340*4882a593Smuzhiyun	veor		\x3, \x3, \t6
341*4882a593Smuzhiyun	veor		\x6, \x6, \t2
342*4882a593Smuzhiyun	.else
343*4882a593Smuzhiyun	veor		\t3, \t3, \x4
344*4882a593Smuzhiyun	veor		\x5, \x5, \t7
345*4882a593Smuzhiyun	veor		\x2, \x3, \t6
346*4882a593Smuzhiyun	veor		\x3, \t0, \t4
347*4882a593Smuzhiyun	veor		\x4, \x6, \t2
348*4882a593Smuzhiyun	vmov		\x6, \t3
349*4882a593Smuzhiyun	.endif
350*4882a593Smuzhiyun	.endm
351*4882a593Smuzhiyun
352*4882a593Smuzhiyun	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
353*4882a593Smuzhiyun				      t0, t1, t2, t3, t4, t5, t6, t7
// Inverse MixColumns for the decrypt path.  First applies AddRoundKey:
// loads all 8 bit-sliced key registers (the last load without
// writeback, then bskey is rewound by #224 = 7*32 so it points at the
// previous round's key for the next iteration — the decrypt path walks
// the schedule backwards).  The extra vext/veor preprocessing plus a
// trailing mix_cols (inv=1) together implement InvMixColumns.
354*4882a593Smuzhiyun	vld1.8		{\t0-\t1}, [bskey, :256]!
355*4882a593Smuzhiyun	veor		\x0, \x0, \t0
356*4882a593Smuzhiyun	vld1.8		{\t2-\t3}, [bskey, :256]!
357*4882a593Smuzhiyun	veor		\x1, \x1, \t1
358*4882a593Smuzhiyun	vld1.8		{\t4-\t5}, [bskey, :256]!
359*4882a593Smuzhiyun	veor		\x2, \x2, \t2
360*4882a593Smuzhiyun	vld1.8		{\t6-\t7}, [bskey, :256]
361*4882a593Smuzhiyun	sub		bskey, bskey, #224
362*4882a593Smuzhiyun	veor		\x3, \x3, \t3
363*4882a593Smuzhiyun	veor		\x4, \x4, \t4
364*4882a593Smuzhiyun	veor		\x5, \x5, \t5
365*4882a593Smuzhiyun	veor		\x6, \x6, \t6
366*4882a593Smuzhiyun	veor		\x7, \x7, \t7
367*4882a593Smuzhiyun	vext.8		\t0, \x0, \x0, #8
368*4882a593Smuzhiyun	vext.8		\t6, \x6, \x6, #8
369*4882a593Smuzhiyun	vext.8		\t7, \x7, \x7, #8
370*4882a593Smuzhiyun	veor		\t0, \t0, \x0
371*4882a593Smuzhiyun	vext.8		\t1, \x1, \x1, #8
372*4882a593Smuzhiyun	veor		\t6, \t6, \x6
373*4882a593Smuzhiyun	vext.8		\t2, \x2, \x2, #8
374*4882a593Smuzhiyun	veor		\t7, \t7, \x7
375*4882a593Smuzhiyun	vext.8		\t3, \x3, \x3, #8
376*4882a593Smuzhiyun	veor		\t1, \t1, \x1
377*4882a593Smuzhiyun	vext.8		\t4, \x4, \x4, #8
378*4882a593Smuzhiyun	veor		\t2, \t2, \x2
379*4882a593Smuzhiyun	vext.8		\t5, \x5, \x5, #8
380*4882a593Smuzhiyun	veor		\t3, \t3, \x3
381*4882a593Smuzhiyun	veor		\t4, \t4, \x4
382*4882a593Smuzhiyun	veor		\t5, \t5, \x5
383*4882a593Smuzhiyun	veor		\x0, \x0, \t6
384*4882a593Smuzhiyun	veor		\x1, \x1, \t6
385*4882a593Smuzhiyun	veor		\x2, \x2, \t0
386*4882a593Smuzhiyun	veor		\x4, \x4, \t2
387*4882a593Smuzhiyun	veor		\x3, \x3, \t1
388*4882a593Smuzhiyun	veor		\x1, \x1, \t7
389*4882a593Smuzhiyun	veor		\x2, \x2, \t7
390*4882a593Smuzhiyun	veor		\x4, \x4, \t6
391*4882a593Smuzhiyun	veor		\x5, \x5, \t3
392*4882a593Smuzhiyun	veor		\x3, \x3, \t6
393*4882a593Smuzhiyun	veor		\x6, \x6, \t4
394*4882a593Smuzhiyun	veor		\x4, \x4, \t7
395*4882a593Smuzhiyun	veor		\x5, \x5, \t7
396*4882a593Smuzhiyun	veor		\x7, \x7, \t5
397*4882a593Smuzhiyun	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
398*4882a593Smuzhiyun			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
399*4882a593Smuzhiyun	.endm
400*4882a593Smuzhiyun
401*4882a593Smuzhiyun	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
// Double "swapmove": for both register pairs (\a0,\b0) and (\a1,\b1),
// exchange the bit groups selected by \mask between the pair, with \b
// shifted right by \n relative to \a.  Classic bitslicing primitive:
// t = ((b >> n) ^ a) & mask;  a ^= t;  b ^= t << n.
// \t0/\t1 are scratch.
402*4882a593Smuzhiyun	vshr.u64	\t0, \b0, #\n
403*4882a593Smuzhiyun	vshr.u64	\t1, \b1, #\n
404*4882a593Smuzhiyun	veor		\t0, \t0, \a0
405*4882a593Smuzhiyun	veor		\t1, \t1, \a1
406*4882a593Smuzhiyun	vand		\t0, \t0, \mask
407*4882a593Smuzhiyun	vand		\t1, \t1, \mask
408*4882a593Smuzhiyun	veor		\a0, \a0, \t0
// vshl.s64 with a positive immediate is a plain left shift here.
409*4882a593Smuzhiyun	vshl.s64	\t0, \t0, #\n
410*4882a593Smuzhiyun	veor		\a1, \a1, \t1
411*4882a593Smuzhiyun	vshl.s64	\t1, \t1, #\n
412*4882a593Smuzhiyun	veor		\b0, \b0, \t0
413*4882a593Smuzhiyun	veor		\b1, \b1, \t1
414*4882a593Smuzhiyun	.endm
415*4882a593Smuzhiyun
416*4882a593Smuzhiyun	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
// Transpose eight 128-bit AES states into bit-sliced form (each of
// x0..x7 ends up holding one bit position of all eight blocks) using
// three rounds of swapmove with masks 0x55/0x33/0x0f and shifts
// 1/2/4.  The same macro is used for both slicing and un-slicing in
// aesbs_encrypt8/aesbs_decrypt8 (with the appropriate register order).
// \t0..\t3 are scratch.
417*4882a593Smuzhiyun	vmov.i8		\t0, #0x55
418*4882a593Smuzhiyun	vmov.i8		\t1, #0x33
419*4882a593Smuzhiyun	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
420*4882a593Smuzhiyun	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
421*4882a593Smuzhiyun	vmov.i8		\t0, #0x0f
422*4882a593Smuzhiyun	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
423*4882a593Smuzhiyun	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
424*4882a593Smuzhiyun	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
425*4882a593Smuzhiyun	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
426*4882a593Smuzhiyun	.endm
427*4882a593Smuzhiyun
428*4882a593Smuzhiyun	.align		4
// M0: byte permutation applied to each expanded round key word.
429*4882a593SmuzhiyunM0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
430*4882a593Smuzhiyun
431*4882a593Smuzhiyun	/*
432*4882a593Smuzhiyun	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
433*4882a593Smuzhiyun	 *
434*4882a593Smuzhiyun	 * Convert a standard AES key schedule (rk, rounds+1 round keys)
435*4882a593Smuzhiyun	 * into the bit-sliced layout consumed by the encrypt8/decrypt8
436*4882a593Smuzhiyun	 * cores: round 0 key verbatim, then 8 x 16 bytes per inner round
437*4882a593Smuzhiyun	 * (one register per bit position), then the final round key.
438*4882a593Smuzhiyun	 */
434*4882a593SmuzhiyunENTRY(aesbs_convert_key)
435*4882a593Smuzhiyun	vld1.32		{q7}, [r1]!		// load round 0 key
436*4882a593Smuzhiyun	vld1.32		{q15}, [r1]!		// load round 1 key
437*4882a593Smuzhiyun
438*4882a593Smuzhiyun	vmov.i8		q8,  #0x01		// bit masks
439*4882a593Smuzhiyun	vmov.i8		q9,  #0x02
440*4882a593Smuzhiyun	vmov.i8		q10, #0x04
441*4882a593Smuzhiyun	vmov.i8		q11, #0x08
442*4882a593Smuzhiyun	vmov.i8		q12, #0x10
443*4882a593Smuzhiyun	vmov.i8		q13, #0x20
444*4882a593Smuzhiyun	__ldr		q14, M0
445*4882a593Smuzhiyun
446*4882a593Smuzhiyun	sub		r2, r2, #1
447*4882a593Smuzhiyun	vst1.8		{q7}, [r0, :128]!	// save round 0 key
448*4882a593Smuzhiyun
449*4882a593Smuzhiyun.Lkey_loop:
450*4882a593Smuzhiyun	__tbl		q7, q15, q14
451*4882a593Smuzhiyun	vmov.i8		q6, #0x40
452*4882a593Smuzhiyun	vmov.i8		q15, #0x80
453*4882a593Smuzhiyun
// vtst expands bit N of each key byte into a full byte mask in qN.
454*4882a593Smuzhiyun	vtst.8		q0, q7, q8
455*4882a593Smuzhiyun	vtst.8		q1, q7, q9
456*4882a593Smuzhiyun	vtst.8		q2, q7, q10
457*4882a593Smuzhiyun	vtst.8		q3, q7, q11
458*4882a593Smuzhiyun	vtst.8		q4, q7, q12
459*4882a593Smuzhiyun	vtst.8		q5, q7, q13
460*4882a593Smuzhiyun	vtst.8		q6, q7, q6
461*4882a593Smuzhiyun	vtst.8		q7, q7, q15
462*4882a593Smuzhiyun	vld1.32		{q15}, [r1]!		// load next round key
// Bit planes 0, 1, 5 and 6 are stored complemented — presumably to
// fold the sbox affine constant into the schedule, as in the bsaes
// original this file is derived from (see file header).
463*4882a593Smuzhiyun	vmvn		q0, q0
464*4882a593Smuzhiyun	vmvn		q1, q1
465*4882a593Smuzhiyun	vmvn		q5, q5
466*4882a593Smuzhiyun	vmvn		q6, q6
467*4882a593Smuzhiyun
468*4882a593Smuzhiyun	subs		r2, r2, #1
469*4882a593Smuzhiyun	vst1.8		{q0-q1}, [r0, :256]!
470*4882a593Smuzhiyun	vst1.8		{q2-q3}, [r0, :256]!
471*4882a593Smuzhiyun	vst1.8		{q4-q5}, [r0, :256]!
472*4882a593Smuzhiyun	vst1.8		{q6-q7}, [r0, :256]!
473*4882a593Smuzhiyun	bne		.Lkey_loop
474*4882a593Smuzhiyun
// The last round key is stored XORed with the sbox constant 0x63.
475*4882a593Smuzhiyun	vmov.i8		q7, #0x63		// compose .L63
476*4882a593Smuzhiyun	veor		q15, q15, q7
477*4882a593Smuzhiyun	vst1.8		{q15}, [r0, :128]
478*4882a593Smuzhiyun	bx		lr
479*4882a593SmuzhiyunENDPROC(aesbs_convert_key)
480*4882a593Smuzhiyun
481*4882a593Smuzhiyun	.align		4
// M0SR: combined initial byte permutation + ShiftRows for round 0.
482*4882a593SmuzhiyunM0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01
483*4882a593Smuzhiyun
// aesbs_encrypt8 — encrypt 8 blocks held in q0-q7.
// In:  q0-q7 = plaintext blocks, bskey (r4) = bit-sliced key schedule,
//      rounds (ip) = round count.
// Out: ciphertext in q0, q1, q4, q6, q3, q7, q2, q5 (in that block
//      order — the sbox's register permutation is not undone; callers
//      such as __ecb_crypt store in this order).
// Clobbers q8-q15, rounds; internal to this file (called via bl).
484*4882a593Smuzhiyunaesbs_encrypt8:
485*4882a593Smuzhiyun	vld1.8		{q9}, [bskey, :128]!	// round 0 key
486*4882a593Smuzhiyun	__ldr		q8, M0SR
487*4882a593Smuzhiyun
488*4882a593Smuzhiyun	veor		q10, q0, q9		// xor with round0 key
489*4882a593Smuzhiyun	veor		q11, q1, q9
490*4882a593Smuzhiyun	__tbl		q0, q10, q8
491*4882a593Smuzhiyun	veor		q12, q2, q9
492*4882a593Smuzhiyun	__tbl		q1, q11, q8
493*4882a593Smuzhiyun	veor		q13, q3, q9
494*4882a593Smuzhiyun	__tbl		q2, q12, q8
495*4882a593Smuzhiyun	veor		q14, q4, q9
496*4882a593Smuzhiyun	__tbl		q3, q13, q8
497*4882a593Smuzhiyun	veor		q15, q5, q9
498*4882a593Smuzhiyun	__tbl		q4, q14, q8
499*4882a593Smuzhiyun	veor		q10, q6, q9
500*4882a593Smuzhiyun	__tbl		q5, q15, q8
501*4882a593Smuzhiyun	veor		q11, q7, q9
502*4882a593Smuzhiyun	__tbl		q6, q10, q8
503*4882a593Smuzhiyun	__tbl		q7, q11, q8
504*4882a593Smuzhiyun
// Transpose the 8 blocks into bit-sliced representation.
505*4882a593Smuzhiyun	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
506*4882a593Smuzhiyun
507*4882a593Smuzhiyun	sub		rounds, rounds, #1
508*4882a593Smuzhiyun	b		.Lenc_sbox
509*4882a593Smuzhiyun
510*4882a593Smuzhiyun	.align		5
// SR: ShiftRows mask for inner rounds; SRM0: ShiftRows combined with
// the inverse of the bitslice ordering, used for the final round.
511*4882a593SmuzhiyunSR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
512*4882a593SmuzhiyunSRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d
513*4882a593Smuzhiyun
// Round loop: sbox first, then (for all but the last round) mix_cols,
// then shift_rows+AddRoundKey at the top of the next iteration.
// 'rounds' is counted down; carry-clear from subs exits the loop.
514*4882a593Smuzhiyun.Lenc_last:
515*4882a593Smuzhiyun	__ldr		q12, SRM0
516*4882a593Smuzhiyun.Lenc_loop:
517*4882a593Smuzhiyun	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
518*4882a593Smuzhiyun.Lenc_sbox:
519*4882a593Smuzhiyun	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
520*4882a593Smuzhiyun								q13, q14, q15
521*4882a593Smuzhiyun	subs		rounds, rounds, #1
522*4882a593Smuzhiyun	bcc		.Lenc_done
523*4882a593Smuzhiyun
// Note the permuted register order — the sbox leaves the state rows
// permuted, and mix_cols/bitslice below are invoked accordingly.
524*4882a593Smuzhiyun	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
525*4882a593Smuzhiyun								q13, q14, q15
526*4882a593Smuzhiyun
527*4882a593Smuzhiyun	beq		.Lenc_last
528*4882a593Smuzhiyun	__ldr		q12, SR
529*4882a593Smuzhiyun	b		.Lenc_loop
530*4882a593Smuzhiyun
531*4882a593Smuzhiyun.Lenc_done:
532*4882a593Smuzhiyun	vld1.8		{q12}, [bskey, :128]	// last round key
533*4882a593Smuzhiyun
// Un-slice back to normal representation, then final AddRoundKey.
534*4882a593Smuzhiyun	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
535*4882a593Smuzhiyun
536*4882a593Smuzhiyun	veor		q0, q0, q12
537*4882a593Smuzhiyun	veor		q1, q1, q12
538*4882a593Smuzhiyun	veor		q4, q4, q12
539*4882a593Smuzhiyun	veor		q6, q6, q12
540*4882a593Smuzhiyun	veor		q3, q3, q12
541*4882a593Smuzhiyun	veor		q7, q7, q12
542*4882a593Smuzhiyun	veor		q2, q2, q12
543*4882a593Smuzhiyun	veor		q5, q5, q12
544*4882a593Smuzhiyun	bx		lr
545*4882a593SmuzhiyunENDPROC(aesbs_encrypt8)
546*4882a593Smuzhiyun
547*4882a593Smuzhiyun	.align		4
// M0ISR: combined initial byte permutation + inverse ShiftRows.
548*4882a593SmuzhiyunM0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509
549*4882a593Smuzhiyun
// aesbs_decrypt8 — decrypt 8 blocks held in q0-q7.
// In:  q0-q7 = ciphertext blocks, bskey (r4) = bit-sliced key schedule,
//      rounds (ip) = round count.
// Out: plaintext in q0, q1, q6, q4, q2, q7, q3, q5 (block order).
// Clobbers q8-q15, rounds; internal to this file (called via bl).
550*4882a593Smuzhiyunaesbs_decrypt8:
// The decrypt path walks the schedule backwards: each round key is
// 8*16 = 128 bytes (rounds << 7), minus 112 to land on the last key.
551*4882a593Smuzhiyun	add		bskey, bskey, rounds, lsl #7
552*4882a593Smuzhiyun	sub		bskey, bskey, #112
553*4882a593Smuzhiyun	vld1.8		{q9}, [bskey, :128]	// round 0 key
554*4882a593Smuzhiyun	sub		bskey, bskey, #128
555*4882a593Smuzhiyun	__ldr		q8, M0ISR
556*4882a593Smuzhiyun
557*4882a593Smuzhiyun	veor		q10, q0, q9		// xor with round0 key
558*4882a593Smuzhiyun	veor		q11, q1, q9
559*4882a593Smuzhiyun	__tbl		q0, q10, q8
560*4882a593Smuzhiyun	veor		q12, q2, q9
561*4882a593Smuzhiyun	__tbl		q1, q11, q8
562*4882a593Smuzhiyun	veor		q13, q3, q9
563*4882a593Smuzhiyun	__tbl		q2, q12, q8
564*4882a593Smuzhiyun	veor		q14, q4, q9
565*4882a593Smuzhiyun	__tbl		q3, q13, q8
566*4882a593Smuzhiyun	veor		q15, q5, q9
567*4882a593Smuzhiyun	__tbl		q4, q14, q8
568*4882a593Smuzhiyun	veor		q10, q6, q9
569*4882a593Smuzhiyun	__tbl		q5, q15, q8
570*4882a593Smuzhiyun	veor		q11, q7, q9
571*4882a593Smuzhiyun	__tbl		q6, q10, q8
572*4882a593Smuzhiyun	__tbl		q7, q11, q8
573*4882a593Smuzhiyun
// Transpose the 8 blocks into bit-sliced representation.
574*4882a593Smuzhiyun	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
575*4882a593Smuzhiyun
576*4882a593Smuzhiyun	sub		rounds, rounds, #1
577*4882a593Smuzhiyun	b		.Ldec_sbox
578*4882a593Smuzhiyun
579*4882a593Smuzhiyun	.align		5
// ISR: inverse ShiftRows mask for inner rounds; ISRM0: the final-round
// variant combined with undoing the bitslice ordering.
580*4882a593SmuzhiyunISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
581*4882a593SmuzhiyunISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d
582*4882a593Smuzhiyun
// Round loop, mirror image of .Lenc_loop: inv_sbox, then (except on
// the last round) inv_mix_cols (which also applies the round key),
// then inv_shift_rows at the top of the next iteration.
583*4882a593Smuzhiyun.Ldec_last:
584*4882a593Smuzhiyun	__ldr		q12, ISRM0
585*4882a593Smuzhiyun.Ldec_loop:
586*4882a593Smuzhiyun	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
587*4882a593Smuzhiyun.Ldec_sbox:
588*4882a593Smuzhiyun	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
589*4882a593Smuzhiyun								q13, q14, q15
590*4882a593Smuzhiyun	subs		rounds, rounds, #1
591*4882a593Smuzhiyun	bcc		.Ldec_done
592*4882a593Smuzhiyun
593*4882a593Smuzhiyun	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
594*4882a593Smuzhiyun								q13, q14, q15
595*4882a593Smuzhiyun
596*4882a593Smuzhiyun	beq		.Ldec_last
597*4882a593Smuzhiyun	__ldr		q12, ISR
598*4882a593Smuzhiyun	b		.Ldec_loop
599*4882a593Smuzhiyun
600*4882a593Smuzhiyun.Ldec_done:
// bskey was left 112 bytes before the round-0 key by the loop rewinds;
// step forward to it for the final AddRoundKey.
601*4882a593Smuzhiyun	add		bskey, bskey, #112
602*4882a593Smuzhiyun	vld1.8		{q12}, [bskey, :128]	// last round key
603*4882a593Smuzhiyun
// Un-slice back to normal representation, then final AddRoundKey.
604*4882a593Smuzhiyun	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
605*4882a593Smuzhiyun
606*4882a593Smuzhiyun	veor		q0, q0, q12
607*4882a593Smuzhiyun	veor		q1, q1, q12
608*4882a593Smuzhiyun	veor		q6, q6, q12
609*4882a593Smuzhiyun	veor		q4, q4, q12
610*4882a593Smuzhiyun	veor		q2, q2, q12
611*4882a593Smuzhiyun	veor		q7, q7, q12
612*4882a593Smuzhiyun	veor		q3, q3, q12
613*4882a593Smuzhiyun	veor		q5, q5, q12
614*4882a593Smuzhiyun	bx		lr
615*4882a593SmuzhiyunENDPROC(aesbs_decrypt8)
616*4882a593Smuzhiyun
617*4882a593Smuzhiyun	/*
618*4882a593Smuzhiyun	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
619*4882a593Smuzhiyun	 *		     int blocks)
620*4882a593Smuzhiyun	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
621*4882a593Smuzhiyun	 *		     int blocks)
622*4882a593Smuzhiyun	 */
// Shared ECB body.  \do8 is the 8-block core to call; \o0..\o7 name
// the registers holding output blocks 0..7 in order (the cores return
// their results in a permuted register order, see aesbs_encrypt8).
623*4882a593Smuzhiyun	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
624*4882a593Smuzhiyun	push		{r4-r6, lr}
625*4882a593Smuzhiyun	ldr		r5, [sp, #16]		// number of blocks
626*4882a593Smuzhiyun
// Computed goto: each vld1 below is a 4-byte instruction, so jumping
// to 0f minus (blocks & 7) * 4 loads exactly 'blocks' registers when
// fewer than 8 blocks remain (registers for missing blocks keep stale
// contents and their outputs are simply not stored).
627*4882a593Smuzhiyun99:	adr		ip, 0f
628*4882a593Smuzhiyun	and		lr, r5, #7
629*4882a593Smuzhiyun	cmp		r5, #8
630*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #2
631*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
632*4882a593Smuzhiyun
633*4882a593Smuzhiyun	vld1.8		{q0}, [r1]!
634*4882a593Smuzhiyun	vld1.8		{q1}, [r1]!
635*4882a593Smuzhiyun	vld1.8		{q2}, [r1]!
636*4882a593Smuzhiyun	vld1.8		{q3}, [r1]!
637*4882a593Smuzhiyun	vld1.8		{q4}, [r1]!
638*4882a593Smuzhiyun	vld1.8		{q5}, [r1]!
639*4882a593Smuzhiyun	vld1.8		{q6}, [r1]!
640*4882a593Smuzhiyun	vld1.8		{q7}, [r1]!
641*4882a593Smuzhiyun
// Reload bskey/rounds each iteration: the cores advance/clobber both.
642*4882a593Smuzhiyun0:	mov		bskey, r2
643*4882a593Smuzhiyun	mov		rounds, r3
644*4882a593Smuzhiyun	bl		\do8
645*4882a593Smuzhiyun
// Same computed-goto trick for the stores (vst1 is 4 bytes too).
646*4882a593Smuzhiyun	adr		ip, 1f
647*4882a593Smuzhiyun	and		lr, r5, #7
648*4882a593Smuzhiyun	cmp		r5, #8
649*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #2
650*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
651*4882a593Smuzhiyun
652*4882a593Smuzhiyun	vst1.8		{\o0}, [r0]!
653*4882a593Smuzhiyun	vst1.8		{\o1}, [r0]!
654*4882a593Smuzhiyun	vst1.8		{\o2}, [r0]!
655*4882a593Smuzhiyun	vst1.8		{\o3}, [r0]!
656*4882a593Smuzhiyun	vst1.8		{\o4}, [r0]!
657*4882a593Smuzhiyun	vst1.8		{\o5}, [r0]!
658*4882a593Smuzhiyun	vst1.8		{\o6}, [r0]!
659*4882a593Smuzhiyun	vst1.8		{\o7}, [r0]!
660*4882a593Smuzhiyun
661*4882a593Smuzhiyun1:	subs		r5, r5, #8
662*4882a593Smuzhiyun	bgt		99b
663*4882a593Smuzhiyun
664*4882a593Smuzhiyun	pop		{r4-r6, pc}
665*4882a593Smuzhiyun	.endm
666*4882a593Smuzhiyun
667*4882a593Smuzhiyun	.align		4
668*4882a593SmuzhiyunENTRY(aesbs_ecb_encrypt)
// Output register order matches aesbs_encrypt8's permuted results.
669*4882a593Smuzhiyun	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
670*4882a593SmuzhiyunENDPROC(aesbs_ecb_encrypt)
671*4882a593Smuzhiyun
672*4882a593Smuzhiyun	.align		4
673*4882a593SmuzhiyunENTRY(aesbs_ecb_decrypt)
// Output register order matches aesbs_decrypt8's permuted results.
674*4882a593Smuzhiyun	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
675*4882a593SmuzhiyunENDPROC(aesbs_ecb_decrypt)
676*4882a593Smuzhiyun
677*4882a593Smuzhiyun	/*
678*4882a593Smuzhiyun	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
679*4882a593Smuzhiyun	 *		     int rounds, int blocks, u8 iv[])
680*4882a593Smuzhiyun	 *
681*4882a593Smuzhiyun	 * CBC decryption: 8 blocks at a time; each decrypted block is
682*4882a593Smuzhiyun	 * XORed with the previous ciphertext block (or the IV), which is
683*4882a593Smuzhiyun	 * why the ciphertext is read twice (once into q0-q7, once as the
684*4882a593Smuzhiyun	 * chaining values q9-q15).  iv[] is updated for the next call.
685*4882a593Smuzhiyun	 */
681*4882a593Smuzhiyun	.align		4
682*4882a593SmuzhiyunENTRY(aesbs_cbc_decrypt)
683*4882a593Smuzhiyun	mov		ip, sp
684*4882a593Smuzhiyun	push		{r4-r6, lr}
685*4882a593Smuzhiyun	ldm		ip, {r5-r6}		// load args 4-5
686*4882a593Smuzhiyun
// Computed goto (vld1 = 4 bytes each): load only 'blocks & 7' inputs
// when fewer than 8 remain.  lr is a copy of r1 so the ciphertext can
// be re-read below as chaining input; hence the last load (q7) has no
// writeback.
687*4882a593Smuzhiyun99:	adr		ip, 0f
688*4882a593Smuzhiyun	and		lr, r5, #7
689*4882a593Smuzhiyun	cmp		r5, #8
690*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #2
691*4882a593Smuzhiyun	mov		lr, r1
692*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
693*4882a593Smuzhiyun
694*4882a593Smuzhiyun	vld1.8		{q0}, [lr]!
695*4882a593Smuzhiyun	vld1.8		{q1}, [lr]!
696*4882a593Smuzhiyun	vld1.8		{q2}, [lr]!
697*4882a593Smuzhiyun	vld1.8		{q3}, [lr]!
698*4882a593Smuzhiyun	vld1.8		{q4}, [lr]!
699*4882a593Smuzhiyun	vld1.8		{q5}, [lr]!
700*4882a593Smuzhiyun	vld1.8		{q6}, [lr]!
701*4882a593Smuzhiyun	vld1.8		{q7}, [lr]
702*4882a593Smuzhiyun
703*4882a593Smuzhiyun0:	mov		bskey, r2
704*4882a593Smuzhiyun	mov		rounds, r3
705*4882a593Smuzhiyun	bl		aesbs_decrypt8
706*4882a593Smuzhiyun
// Chaining values: q8 = IV, q9-q15 default to the IV as well so that
// when < 8 blocks are processed the unused lanes are harmless, then
// are overwritten below with ciphertext blocks 0..6.
707*4882a593Smuzhiyun	vld1.8		{q8}, [r6]
708*4882a593Smuzhiyun	vmov		q9, q8
709*4882a593Smuzhiyun	vmov		q10, q8
710*4882a593Smuzhiyun	vmov		q11, q8
711*4882a593Smuzhiyun	vmov		q12, q8
712*4882a593Smuzhiyun	vmov		q13, q8
713*4882a593Smuzhiyun	vmov		q14, q8
714*4882a593Smuzhiyun	vmov		q15, q8
715*4882a593Smuzhiyun
// Computed goto again over the 4-byte vld1's (W(nop) pads block 8's
// slot so the offsets line up).
716*4882a593Smuzhiyun	adr		ip, 1f
717*4882a593Smuzhiyun	and		lr, r5, #7
718*4882a593Smuzhiyun	cmp		r5, #8
719*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #2
720*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
721*4882a593Smuzhiyun
722*4882a593Smuzhiyun	vld1.8		{q9}, [r1]!
723*4882a593Smuzhiyun	vld1.8		{q10}, [r1]!
724*4882a593Smuzhiyun	vld1.8		{q11}, [r1]!
725*4882a593Smuzhiyun	vld1.8		{q12}, [r1]!
726*4882a593Smuzhiyun	vld1.8		{q13}, [r1]!
727*4882a593Smuzhiyun	vld1.8		{q14}, [r1]!
728*4882a593Smuzhiyun	vld1.8		{q15}, [r1]!
729*4882a593Smuzhiyun	W(nop)
730*4882a593Smuzhiyun
// Output stage: veor + vst1 pairs are 8 bytes each, hence lsl #3.
// Decrypted blocks come back from aesbs_decrypt8 in register order
// q0,q1,q6,q4,q2,q7,q3,q5.  lr still holds blocks & 7 from above.
731*4882a593Smuzhiyun1:	adr		ip, 2f
732*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #3
733*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
734*4882a593Smuzhiyun
735*4882a593Smuzhiyun	veor		q0, q0, q8
736*4882a593Smuzhiyun	vst1.8		{q0}, [r0]!
737*4882a593Smuzhiyun	veor		q1, q1, q9
738*4882a593Smuzhiyun	vst1.8		{q1}, [r0]!
739*4882a593Smuzhiyun	veor		q6, q6, q10
740*4882a593Smuzhiyun	vst1.8		{q6}, [r0]!
741*4882a593Smuzhiyun	veor		q4, q4, q11
742*4882a593Smuzhiyun	vst1.8		{q4}, [r0]!
743*4882a593Smuzhiyun	veor		q2, q2, q12
744*4882a593Smuzhiyun	vst1.8		{q2}, [r0]!
745*4882a593Smuzhiyun	veor		q7, q7, q13
746*4882a593Smuzhiyun	vst1.8		{q7}, [r0]!
747*4882a593Smuzhiyun	veor		q3, q3, q14
748*4882a593Smuzhiyun	vst1.8		{q3}, [r0]!
749*4882a593Smuzhiyun	veor		q5, q5, q15
750*4882a593Smuzhiyun	vld1.8		{q8}, [r1]!		// load next round's iv
751*4882a593Smuzhiyun2:	vst1.8		{q5}, [r0]!
752*4882a593Smuzhiyun
753*4882a593Smuzhiyun	subs		r5, r5, #8
754*4882a593Smuzhiyun	vst1.8		{q8}, [r6]		// store next round's iv
755*4882a593Smuzhiyun	bgt		99b
756*4882a593Smuzhiyun
757*4882a593Smuzhiyun	pop		{r4-r6, pc}
758*4882a593SmuzhiyunENDPROC(aesbs_cbc_decrypt)
759*4882a593Smuzhiyun
760*4882a593Smuzhiyun	.macro		next_ctr, q
761*4882a593Smuzhiyun	vmov.32		\q\()h[1], r10
762*4882a593Smuzhiyun	adds		r10, r10, #1
763*4882a593Smuzhiyun	vmov.32		\q\()h[0], r9
764*4882a593Smuzhiyun	adcs		r9, r9, #0
765*4882a593Smuzhiyun	vmov.32		\q\()l[1], r8
766*4882a593Smuzhiyun	adcs		r8, r8, #0
767*4882a593Smuzhiyun	vmov.32		\q\()l[0], r7
768*4882a593Smuzhiyun	adc		r7, r7, #0
769*4882a593Smuzhiyun	vrev32.8	\q, \q
770*4882a593Smuzhiyun	.endm
771*4882a593Smuzhiyun
772*4882a593Smuzhiyun	/*
773*4882a593Smuzhiyun	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
774*4882a593Smuzhiyun	 *		     int rounds, int blocks, u8 ctr[], u8 final[])
775*4882a593Smuzhiyun	 */
776*4882a593SmuzhiyunENTRY(aesbs_ctr_encrypt)
777*4882a593Smuzhiyun	mov		ip, sp
778*4882a593Smuzhiyun	push		{r4-r10, lr}
779*4882a593Smuzhiyun
780*4882a593Smuzhiyun	ldm		ip, {r5-r7}		// load args 4-6
781*4882a593Smuzhiyun	teq		r7, #0
782*4882a593Smuzhiyun	addne		r5, r5, #1		// one extra block if final != 0
783*4882a593Smuzhiyun
784*4882a593Smuzhiyun	vld1.8		{q0}, [r6]		// load counter
785*4882a593Smuzhiyun	vrev32.8	q1, q0
786*4882a593Smuzhiyun	vmov		r9, r10, d3
787*4882a593Smuzhiyun	vmov		r7, r8, d2
788*4882a593Smuzhiyun
789*4882a593Smuzhiyun	adds		r10, r10, #1
790*4882a593Smuzhiyun	adcs		r9, r9, #0
791*4882a593Smuzhiyun	adcs		r8, r8, #0
792*4882a593Smuzhiyun	adc		r7, r7, #0
793*4882a593Smuzhiyun
794*4882a593Smuzhiyun99:	vmov		q1, q0
795*4882a593Smuzhiyun	vmov		q2, q0
796*4882a593Smuzhiyun	vmov		q3, q0
797*4882a593Smuzhiyun	vmov		q4, q0
798*4882a593Smuzhiyun	vmov		q5, q0
799*4882a593Smuzhiyun	vmov		q6, q0
800*4882a593Smuzhiyun	vmov		q7, q0
801*4882a593Smuzhiyun
802*4882a593Smuzhiyun	adr		ip, 0f
803*4882a593Smuzhiyun	sub		lr, r5, #1
804*4882a593Smuzhiyun	and		lr, lr, #7
805*4882a593Smuzhiyun	cmp		r5, #8
806*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #5
807*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #2
808*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
809*4882a593Smuzhiyun
810*4882a593Smuzhiyun	next_ctr	q1
811*4882a593Smuzhiyun	next_ctr	q2
812*4882a593Smuzhiyun	next_ctr	q3
813*4882a593Smuzhiyun	next_ctr	q4
814*4882a593Smuzhiyun	next_ctr	q5
815*4882a593Smuzhiyun	next_ctr	q6
816*4882a593Smuzhiyun	next_ctr	q7
817*4882a593Smuzhiyun
818*4882a593Smuzhiyun0:	mov		bskey, r2
819*4882a593Smuzhiyun	mov		rounds, r3
820*4882a593Smuzhiyun	bl		aesbs_encrypt8
821*4882a593Smuzhiyun
822*4882a593Smuzhiyun	adr		ip, 1f
823*4882a593Smuzhiyun	and		lr, r5, #7
824*4882a593Smuzhiyun	cmp		r5, #8
825*4882a593Smuzhiyun	movgt		r4, #0
826*4882a593Smuzhiyun	ldrle		r4, [sp, #40]		// load final in the last round
827*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #2
828*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
829*4882a593Smuzhiyun
830*4882a593Smuzhiyun	vld1.8		{q8}, [r1]!
831*4882a593Smuzhiyun	vld1.8		{q9}, [r1]!
832*4882a593Smuzhiyun	vld1.8		{q10}, [r1]!
833*4882a593Smuzhiyun	vld1.8		{q11}, [r1]!
834*4882a593Smuzhiyun	vld1.8		{q12}, [r1]!
835*4882a593Smuzhiyun	vld1.8		{q13}, [r1]!
836*4882a593Smuzhiyun	vld1.8		{q14}, [r1]!
837*4882a593Smuzhiyun	teq		r4, #0			// skip last block if 'final'
838*4882a593Smuzhiyun1:	bne		2f
839*4882a593Smuzhiyun	vld1.8		{q15}, [r1]!
840*4882a593Smuzhiyun
841*4882a593Smuzhiyun2:	adr		ip, 3f
842*4882a593Smuzhiyun	cmp		r5, #8
843*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #3
844*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
845*4882a593Smuzhiyun
846*4882a593Smuzhiyun	veor		q0, q0, q8
847*4882a593Smuzhiyun	vst1.8		{q0}, [r0]!
848*4882a593Smuzhiyun	veor		q1, q1, q9
849*4882a593Smuzhiyun	vst1.8		{q1}, [r0]!
850*4882a593Smuzhiyun	veor		q4, q4, q10
851*4882a593Smuzhiyun	vst1.8		{q4}, [r0]!
852*4882a593Smuzhiyun	veor		q6, q6, q11
853*4882a593Smuzhiyun	vst1.8		{q6}, [r0]!
854*4882a593Smuzhiyun	veor		q3, q3, q12
855*4882a593Smuzhiyun	vst1.8		{q3}, [r0]!
856*4882a593Smuzhiyun	veor		q7, q7, q13
857*4882a593Smuzhiyun	vst1.8		{q7}, [r0]!
858*4882a593Smuzhiyun	veor		q2, q2, q14
859*4882a593Smuzhiyun	vst1.8		{q2}, [r0]!
860*4882a593Smuzhiyun	teq		r4, #0			// skip last block if 'final'
861*4882a593Smuzhiyun	W(bne)		5f
862*4882a593Smuzhiyun3:	veor		q5, q5, q15
863*4882a593Smuzhiyun	vst1.8		{q5}, [r0]!
864*4882a593Smuzhiyun
865*4882a593Smuzhiyun4:	next_ctr	q0
866*4882a593Smuzhiyun
867*4882a593Smuzhiyun	subs		r5, r5, #8
868*4882a593Smuzhiyun	bgt		99b
869*4882a593Smuzhiyun
870*4882a593Smuzhiyun	vst1.8		{q0}, [r6]
871*4882a593Smuzhiyun	pop		{r4-r10, pc}
872*4882a593Smuzhiyun
873*4882a593Smuzhiyun5:	vst1.8		{q5}, [r4]
874*4882a593Smuzhiyun	b		4b
875*4882a593SmuzhiyunENDPROC(aesbs_ctr_encrypt)
876*4882a593Smuzhiyun
877*4882a593Smuzhiyun	.macro		next_tweak, out, in, const, tmp
878*4882a593Smuzhiyun	vshr.s64	\tmp, \in, #63
879*4882a593Smuzhiyun	vand		\tmp, \tmp, \const
880*4882a593Smuzhiyun	vadd.u64	\out, \in, \in
881*4882a593Smuzhiyun	vext.8		\tmp, \tmp, \tmp, #8
882*4882a593Smuzhiyun	veor		\out, \out, \tmp
883*4882a593Smuzhiyun	.endm
884*4882a593Smuzhiyun
885*4882a593Smuzhiyun	/*
886*4882a593Smuzhiyun	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
887*4882a593Smuzhiyun	 *		     int blocks, u8 iv[], int reorder_last_tweak)
888*4882a593Smuzhiyun	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
889*4882a593Smuzhiyun	 *		     int blocks, u8 iv[], int reorder_last_tweak)
890*4882a593Smuzhiyun	 */
891*4882a593Smuzhiyun__xts_prepare8:
892*4882a593Smuzhiyun	vld1.8		{q14}, [r7]		// load iv
893*4882a593Smuzhiyun	vmov.i32	d30, #0x87		// compose tweak mask vector
894*4882a593Smuzhiyun	vmovl.u32	q15, d30
895*4882a593Smuzhiyun	vshr.u64	d30, d31, #7
896*4882a593Smuzhiyun	vmov		q12, q14
897*4882a593Smuzhiyun
898*4882a593Smuzhiyun	adr		ip, 0f
899*4882a593Smuzhiyun	and		r4, r6, #7
900*4882a593Smuzhiyun	cmp		r6, #8
901*4882a593Smuzhiyun	sub		ip, ip, r4, lsl #5
902*4882a593Smuzhiyun	mov		r4, sp
903*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
904*4882a593Smuzhiyun
905*4882a593Smuzhiyun	vld1.8		{q0}, [r1]!
906*4882a593Smuzhiyun	next_tweak	q12, q14, q15, q13
907*4882a593Smuzhiyun	veor		q0, q0, q14
908*4882a593Smuzhiyun	vst1.8		{q14}, [r4, :128]!
909*4882a593Smuzhiyun
910*4882a593Smuzhiyun	vld1.8		{q1}, [r1]!
911*4882a593Smuzhiyun	next_tweak	q14, q12, q15, q13
912*4882a593Smuzhiyun	veor		q1, q1, q12
913*4882a593Smuzhiyun	vst1.8		{q12}, [r4, :128]!
914*4882a593Smuzhiyun
915*4882a593Smuzhiyun	vld1.8		{q2}, [r1]!
916*4882a593Smuzhiyun	next_tweak	q12, q14, q15, q13
917*4882a593Smuzhiyun	veor		q2, q2, q14
918*4882a593Smuzhiyun	vst1.8		{q14}, [r4, :128]!
919*4882a593Smuzhiyun
920*4882a593Smuzhiyun	vld1.8		{q3}, [r1]!
921*4882a593Smuzhiyun	next_tweak	q14, q12, q15, q13
922*4882a593Smuzhiyun	veor		q3, q3, q12
923*4882a593Smuzhiyun	vst1.8		{q12}, [r4, :128]!
924*4882a593Smuzhiyun
925*4882a593Smuzhiyun	vld1.8		{q4}, [r1]!
926*4882a593Smuzhiyun	next_tweak	q12, q14, q15, q13
927*4882a593Smuzhiyun	veor		q4, q4, q14
928*4882a593Smuzhiyun	vst1.8		{q14}, [r4, :128]!
929*4882a593Smuzhiyun
930*4882a593Smuzhiyun	vld1.8		{q5}, [r1]!
931*4882a593Smuzhiyun	next_tweak	q14, q12, q15, q13
932*4882a593Smuzhiyun	veor		q5, q5, q12
933*4882a593Smuzhiyun	vst1.8		{q12}, [r4, :128]!
934*4882a593Smuzhiyun
935*4882a593Smuzhiyun	vld1.8		{q6}, [r1]!
936*4882a593Smuzhiyun	next_tweak	q12, q14, q15, q13
937*4882a593Smuzhiyun	veor		q6, q6, q14
938*4882a593Smuzhiyun	vst1.8		{q14}, [r4, :128]!
939*4882a593Smuzhiyun
940*4882a593Smuzhiyun	vld1.8		{q7}, [r1]!
941*4882a593Smuzhiyun	next_tweak	q14, q12, q15, q13
942*4882a593SmuzhiyunTHUMB(	itt		le		)
943*4882a593Smuzhiyun	W(cmple)	r8, #0
944*4882a593Smuzhiyun	ble		1f
945*4882a593Smuzhiyun0:	veor		q7, q7, q12
946*4882a593Smuzhiyun	vst1.8		{q12}, [r4, :128]
947*4882a593Smuzhiyun
948*4882a593Smuzhiyun	vst1.8		{q14}, [r7]		// store next iv
949*4882a593Smuzhiyun	bx		lr
950*4882a593Smuzhiyun
951*4882a593Smuzhiyun1:	vswp		q12, q14
952*4882a593Smuzhiyun	b		0b
953*4882a593SmuzhiyunENDPROC(__xts_prepare8)
954*4882a593Smuzhiyun
955*4882a593Smuzhiyun	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
956*4882a593Smuzhiyun	push		{r4-r8, lr}
957*4882a593Smuzhiyun	mov		r5, sp			// preserve sp
958*4882a593Smuzhiyun	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
959*4882a593Smuzhiyun	rsb		r8, ip, #1
960*4882a593Smuzhiyun	sub		ip, sp, #128		// make room for 8x tweak
961*4882a593Smuzhiyun	bic		ip, ip, #0xf		// align sp to 16 bytes
962*4882a593Smuzhiyun	mov		sp, ip
963*4882a593Smuzhiyun
964*4882a593Smuzhiyun99:	bl		__xts_prepare8
965*4882a593Smuzhiyun
966*4882a593Smuzhiyun	mov		bskey, r2
967*4882a593Smuzhiyun	mov		rounds, r3
968*4882a593Smuzhiyun	bl		\do8
969*4882a593Smuzhiyun
970*4882a593Smuzhiyun	adr		ip, 0f
971*4882a593Smuzhiyun	and		lr, r6, #7
972*4882a593Smuzhiyun	cmp		r6, #8
973*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #2
974*4882a593Smuzhiyun	mov		r4, sp
975*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
976*4882a593Smuzhiyun
977*4882a593Smuzhiyun	vld1.8		{q8}, [r4, :128]!
978*4882a593Smuzhiyun	vld1.8		{q9}, [r4, :128]!
979*4882a593Smuzhiyun	vld1.8		{q10}, [r4, :128]!
980*4882a593Smuzhiyun	vld1.8		{q11}, [r4, :128]!
981*4882a593Smuzhiyun	vld1.8		{q12}, [r4, :128]!
982*4882a593Smuzhiyun	vld1.8		{q13}, [r4, :128]!
983*4882a593Smuzhiyun	vld1.8		{q14}, [r4, :128]!
984*4882a593Smuzhiyun	vld1.8		{q15}, [r4, :128]
985*4882a593Smuzhiyun
986*4882a593Smuzhiyun0:	adr		ip, 1f
987*4882a593Smuzhiyun	sub		ip, ip, lr, lsl #3
988*4882a593Smuzhiyun	movlt		pc, ip			// computed goto if blocks < 8
989*4882a593Smuzhiyun
990*4882a593Smuzhiyun	veor		\o0, \o0, q8
991*4882a593Smuzhiyun	vst1.8		{\o0}, [r0]!
992*4882a593Smuzhiyun	veor		\o1, \o1, q9
993*4882a593Smuzhiyun	vst1.8		{\o1}, [r0]!
994*4882a593Smuzhiyun	veor		\o2, \o2, q10
995*4882a593Smuzhiyun	vst1.8		{\o2}, [r0]!
996*4882a593Smuzhiyun	veor		\o3, \o3, q11
997*4882a593Smuzhiyun	vst1.8		{\o3}, [r0]!
998*4882a593Smuzhiyun	veor		\o4, \o4, q12
999*4882a593Smuzhiyun	vst1.8		{\o4}, [r0]!
1000*4882a593Smuzhiyun	veor		\o5, \o5, q13
1001*4882a593Smuzhiyun	vst1.8		{\o5}, [r0]!
1002*4882a593Smuzhiyun	veor		\o6, \o6, q14
1003*4882a593Smuzhiyun	vst1.8		{\o6}, [r0]!
1004*4882a593Smuzhiyun	veor		\o7, \o7, q15
1005*4882a593Smuzhiyun	vst1.8		{\o7}, [r0]!
1006*4882a593Smuzhiyun
1007*4882a593Smuzhiyun1:	subs		r6, r6, #8
1008*4882a593Smuzhiyun	bgt		99b
1009*4882a593Smuzhiyun
1010*4882a593Smuzhiyun	mov		sp, r5
1011*4882a593Smuzhiyun	pop		{r4-r8, pc}
1012*4882a593Smuzhiyun	.endm
1013*4882a593Smuzhiyun
1014*4882a593SmuzhiyunENTRY(aesbs_xts_encrypt)
1015*4882a593Smuzhiyun	mov		ip, #0			// never reorder final tweak
1016*4882a593Smuzhiyun	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
1017*4882a593SmuzhiyunENDPROC(aesbs_xts_encrypt)
1018*4882a593Smuzhiyun
1019*4882a593SmuzhiyunENTRY(aesbs_xts_decrypt)
1020*4882a593Smuzhiyun	ldr		ip, [sp, #8]		// reorder final tweak?
1021*4882a593Smuzhiyun	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
1022*4882a593SmuzhiyunENDPROC(aesbs_xts_decrypt)
1023