xref: /OK3568_Linux_fs/kernel/arch/arm64/crypto/aes-neonbs-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-only */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Bit sliced AES using NEON instructions
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
6*4882a593Smuzhiyun */
7*4882a593Smuzhiyun
8*4882a593Smuzhiyun/*
9*4882a593Smuzhiyun * The algorithm implemented here is described in detail by the paper
10*4882a593Smuzhiyun * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
11*4882a593Smuzhiyun * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
12*4882a593Smuzhiyun *
13*4882a593Smuzhiyun * This implementation is based primarily on the OpenSSL implementation
14*4882a593Smuzhiyun * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
15*4882a593Smuzhiyun */
16*4882a593Smuzhiyun
17*4882a593Smuzhiyun#include <linux/linkage.h>
18*4882a593Smuzhiyun#include <asm/assembler.h>
19*4882a593Smuzhiyun
20*4882a593Smuzhiyun	.text
21*4882a593Smuzhiyun
22*4882a593Smuzhiyun	rounds		.req	x11		// round counter consumed by aesbs_encrypt8/aesbs_decrypt8
23*4882a593Smuzhiyun	bskey		.req	x12		// cursor into the key schedule written by aesbs_convert_key
24*4882a593Smuzhiyun
25*4882a593Smuzhiyun	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	/*
	 * XOR-only transform applied in place to the eight operands b0-b7.
	 * The sbox macro runs it on the state before inv_gf256; going by the
	 * name this is presumably the input change of basis of the S-box.
	 * No temporaries are needed. The instruction order matters: later
	 * XORs consume operands already updated by earlier ones.
	 */
26*4882a593Smuzhiyun	eor		\b2, \b2, \b1
27*4882a593Smuzhiyun	eor		\b5, \b5, \b6
28*4882a593Smuzhiyun	eor		\b3, \b3, \b0
29*4882a593Smuzhiyun	eor		\b6, \b6, \b2
30*4882a593Smuzhiyun	eor		\b5, \b5, \b0
31*4882a593Smuzhiyun	eor		\b6, \b6, \b3
32*4882a593Smuzhiyun	eor		\b3, \b3, \b7
33*4882a593Smuzhiyun	eor		\b7, \b7, \b5
34*4882a593Smuzhiyun	eor		\b3, \b3, \b4
35*4882a593Smuzhiyun	eor		\b4, \b4, \b5
36*4882a593Smuzhiyun	eor		\b2, \b2, \b7
37*4882a593Smuzhiyun	eor		\b3, \b3, \b1
38*4882a593Smuzhiyun	eor		\b1, \b1, \b5
39*4882a593Smuzhiyun	.endm
40*4882a593Smuzhiyun
41*4882a593Smuzhiyun	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	/*
	 * XOR-only transform applied in place to the eight operands b0-b7.
	 * The sbox macro runs it as its last step, after inv_gf256
	 * (presumably the output change of basis, going by the name).
	 * Order-dependent; uses no temporaries.
	 */
42*4882a593Smuzhiyun	eor		\b0, \b0, \b6
43*4882a593Smuzhiyun	eor		\b1, \b1, \b4
44*4882a593Smuzhiyun	eor		\b4, \b4, \b6
45*4882a593Smuzhiyun	eor		\b2, \b2, \b0
46*4882a593Smuzhiyun	eor		\b6, \b6, \b1
47*4882a593Smuzhiyun	eor		\b1, \b1, \b5
48*4882a593Smuzhiyun	eor		\b5, \b5, \b3
49*4882a593Smuzhiyun	eor		\b3, \b3, \b7
50*4882a593Smuzhiyun	eor		\b7, \b7, \b5
51*4882a593Smuzhiyun	eor		\b2, \b2, \b5
52*4882a593Smuzhiyun	eor		\b4, \b4, \b7
53*4882a593Smuzhiyun	.endm
54*4882a593Smuzhiyun
55*4882a593Smuzhiyun	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	/*
	 * First step of inv_sbox: XOR-only transform applied in place to the
	 * eight operands. Note that the parameter NAMES are declared in a
	 * permuted order (b6, b1, b2, b4, b7, b0, b3, b5); callers pass their
	 * registers positionally. Order-dependent; uses no temporaries.
	 */
56*4882a593Smuzhiyun	eor		\b1, \b1, \b7
57*4882a593Smuzhiyun	eor		\b4, \b4, \b7
58*4882a593Smuzhiyun	eor		\b7, \b7, \b5
59*4882a593Smuzhiyun	eor		\b1, \b1, \b3
60*4882a593Smuzhiyun	eor		\b2, \b2, \b5
61*4882a593Smuzhiyun	eor		\b3, \b3, \b7
62*4882a593Smuzhiyun	eor		\b6, \b6, \b1
63*4882a593Smuzhiyun	eor		\b2, \b2, \b0
64*4882a593Smuzhiyun	eor		\b5, \b5, \b3
65*4882a593Smuzhiyun	eor		\b4, \b4, \b6
66*4882a593Smuzhiyun	eor		\b0, \b0, \b6
67*4882a593Smuzhiyun	eor		\b1, \b1, \b4
68*4882a593Smuzhiyun	.endm
69*4882a593Smuzhiyun
70*4882a593Smuzhiyun	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	/*
	 * Last step of inv_sbox: XOR-only transform applied in place to the
	 * eight operands. As with inv_in_bs_ch, the parameter names are
	 * declared in a permuted order and callers pass registers
	 * positionally. Order-dependent; uses no temporaries.
	 */
71*4882a593Smuzhiyun	eor		\b1, \b1, \b5
72*4882a593Smuzhiyun	eor		\b2, \b2, \b7
73*4882a593Smuzhiyun	eor		\b3, \b3, \b1
74*4882a593Smuzhiyun	eor		\b4, \b4, \b5
75*4882a593Smuzhiyun	eor		\b7, \b7, \b5
76*4882a593Smuzhiyun	eor		\b3, \b3, \b4
77*4882a593Smuzhiyun	eor 		\b5, \b5, \b0
78*4882a593Smuzhiyun	eor		\b3, \b3, \b7
79*4882a593Smuzhiyun	eor		\b6, \b6, \b2
80*4882a593Smuzhiyun	eor		\b2, \b2, \b1
81*4882a593Smuzhiyun	eor		\b6, \b6, \b3
82*4882a593Smuzhiyun	eor		\b3, \b3, \b0
83*4882a593Smuzhiyun	eor		\b5, \b5, \b6
84*4882a593Smuzhiyun	.endm
85*4882a593Smuzhiyun
86*4882a593Smuzhiyun	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	/*
	 * Bitwise two-bit product (a GF(4) multiply, per the name), computed
	 * in place on x0/x1. With X0, X1 the values on entry:
	 *
	 *   x1 = (X1 & y0) ^ ((y0 ^ y1) & X0)
	 *   x0 = ((X0 ^ X1) & y1) ^ (X1 & y0)
	 *
	 * y0/y1 are only read. t0 and t1 are clobbered.
	 */
87*4882a593Smuzhiyun	eor 		\t0, \y0, \y1
88*4882a593Smuzhiyun	and		\t0, \t0, \x0
89*4882a593Smuzhiyun	eor		\x0, \x0, \x1
90*4882a593Smuzhiyun	and		\t1, \x1, \y0
91*4882a593Smuzhiyun	and		\x0, \x0, \y1
92*4882a593Smuzhiyun	eor		\x1, \t1, \t0
93*4882a593Smuzhiyun	eor		\x0, \x0, \t1
94*4882a593Smuzhiyun	.endm
95*4882a593Smuzhiyun
96*4882a593Smuzhiyun	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	/*
	 * Two independent two-bit products with their instructions
	 * interleaved. With X0..X3 the values on entry:
	 *
	 *   first product (x0/x1 by y0/y1, temporary t0):
	 *     x1 = (X1 & y0) ^ ((X0 ^ X1) & y1)
	 *     x0 = ((X0 ^ X1) & y1) ^ ((y0 ^ y1) & X0)
	 *
	 *   second product (x2/x3 by y2/y3, temporary t1) - same result as
	 *   mul_gf4 would give:
	 *     x2 = ((X2 ^ X3) & y3) ^ (X3 & y2)
	 *     x3 = (X3 & y2) ^ ((y2 ^ y3) & X2)
	 *
	 * The first product combines its partial terms differently from
	 * mul_gf4 (presumably the "n" variant the name refers to).
	 * y0-y3 are only read; t0 and t1 are clobbered.
	 */
97*4882a593Smuzhiyun	eor		\t0, \y0, \y1
98*4882a593Smuzhiyun	eor 		\t1, \y2, \y3
99*4882a593Smuzhiyun	and		\t0, \t0, \x0
100*4882a593Smuzhiyun	and		\t1, \t1, \x2
101*4882a593Smuzhiyun	eor		\x0, \x0, \x1
102*4882a593Smuzhiyun	eor		\x2, \x2, \x3
103*4882a593Smuzhiyun	and		\x1, \x1, \y0
104*4882a593Smuzhiyun	and		\x3, \x3, \y2
105*4882a593Smuzhiyun	and		\x0, \x0, \y1
106*4882a593Smuzhiyun	and		\x2, \x2, \y3
107*4882a593Smuzhiyun	eor		\x1, \x1, \x0
108*4882a593Smuzhiyun	eor		\x2, \x2, \x3
109*4882a593Smuzhiyun	eor		\x0, \x0, \t0
110*4882a593Smuzhiyun	eor		\x3, \x3, \t1
111*4882a593Smuzhiyun	.endm
112*4882a593Smuzhiyun
113*4882a593Smuzhiyun	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
114*4882a593Smuzhiyun				    y0, y1, y2, y3, t0, t1, t2, t3
	/*
	 * Multiplies two four-bit groups, x0-x3 and x4-x7, by the same
	 * operand y0-y3 (two GF(16) products, per the name), in place,
	 * built from mul_gf4 and mul_gf4_n_gf4.
	 *
	 * y0/y1 are temporarily XORed with y2/y3 for the middle steps and
	 * XORed again afterwards, so y0-y3 hold their original values on
	 * exit. t0-t3 are clobbered.
	 */
115*4882a593Smuzhiyun	eor		\t0, \x0, \x2
116*4882a593Smuzhiyun	eor		\t1, \x1, \x3
117*4882a593Smuzhiyun	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
118*4882a593Smuzhiyun	eor		\y0, \y0, \y2
119*4882a593Smuzhiyun	eor		\y1, \y1, \y3
120*4882a593Smuzhiyun	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
121*4882a593Smuzhiyun	eor		\x0, \x0, \t0
122*4882a593Smuzhiyun	eor		\x2, \x2, \t0
123*4882a593Smuzhiyun	eor		\x1, \x1, \t1
124*4882a593Smuzhiyun	eor		\x3, \x3, \t1
125*4882a593Smuzhiyun	eor		\t0, \x4, \x6
126*4882a593Smuzhiyun	eor		\t1, \x5, \x7
127*4882a593Smuzhiyun	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
128*4882a593Smuzhiyun	eor		\y0, \y0, \y2
129*4882a593Smuzhiyun	eor		\y1, \y1, \y3
130*4882a593Smuzhiyun	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
131*4882a593Smuzhiyun	eor		\x4, \x4, \t0
132*4882a593Smuzhiyun	eor		\x6, \x6, \t0
133*4882a593Smuzhiyun	eor		\x5, \x5, \t1
134*4882a593Smuzhiyun	eor		\x7, \x7, \t1
135*4882a593Smuzhiyun	.endm
136*4882a593Smuzhiyun
137*4882a593Smuzhiyun	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
138*4882a593Smuzhiyun				   t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * Core of both sbox and inv_sbox (the GF(2^8) inversion, per the
	 * name). Operates in place on x0-x7; t0-t3 and s0-s3 are scratch and
	 * are clobbered.
	 *
	 * The first part derives four intermediate values from x0-x7 using
	 * AND/ORR/EOR and bitwise selects; the final mul_gf16_2 then
	 * multiplies both halves of x (x0-x3 and x4-x7) by the operand
	 * (s3, s2, s1, t1).
	 *
	 * bsl d, n, m: every bit of d becomes the bit of n where d was 1 and
	 * the bit of m where d was 0.
	 */
139*4882a593Smuzhiyun	eor		\t3, \x4, \x6
140*4882a593Smuzhiyun	eor		\t0, \x5, \x7
141*4882a593Smuzhiyun	eor		\t1, \x1, \x3
142*4882a593Smuzhiyun	eor		\s1, \x7, \x6
143*4882a593Smuzhiyun	eor		\s0, \x0, \x2
144*4882a593Smuzhiyun	eor		\s3, \t3, \t0
145*4882a593Smuzhiyun	orr		\t2, \t0, \t1
146*4882a593Smuzhiyun	and		\s2, \t3, \s0
147*4882a593Smuzhiyun	orr		\t3, \t3, \s0
148*4882a593Smuzhiyun	eor		\s0, \s0, \t1
149*4882a593Smuzhiyun	and		\t0, \t0, \t1
150*4882a593Smuzhiyun	eor		\t1, \x3, \x2
151*4882a593Smuzhiyun	and		\s3, \s3, \s0
152*4882a593Smuzhiyun	and		\s1, \s1, \t1
153*4882a593Smuzhiyun	eor		\t1, \x4, \x5
154*4882a593Smuzhiyun	eor		\s0, \x1, \x0
155*4882a593Smuzhiyun	eor		\t3, \t3, \s1
156*4882a593Smuzhiyun	eor		\t2, \t2, \s1
157*4882a593Smuzhiyun	and		\s1, \t1, \s0
158*4882a593Smuzhiyun	orr		\t1, \t1, \s0
159*4882a593Smuzhiyun	eor		\t3, \t3, \s3
160*4882a593Smuzhiyun	eor		\t0, \t0, \s1
161*4882a593Smuzhiyun	eor		\t2, \t2, \s2
162*4882a593Smuzhiyun	eor		\t1, \t1, \s3
163*4882a593Smuzhiyun	eor		\t0, \t0, \s2
164*4882a593Smuzhiyun	and		\s0, \x7, \x3
165*4882a593Smuzhiyun	eor		\t1, \t1, \s2
166*4882a593Smuzhiyun	and		\s1, \x6, \x2
167*4882a593Smuzhiyun	and		\s2, \x5, \x1
168*4882a593Smuzhiyun	orr		\s3, \x4, \x0
169*4882a593Smuzhiyun	eor		\t3, \t3, \s0
170*4882a593Smuzhiyun	eor		\t1, \t1, \s2
171*4882a593Smuzhiyun	eor		\s0, \t0, \s3
172*4882a593Smuzhiyun	eor		\t2, \t2, \s1
173*4882a593Smuzhiyun	and		\s2, \t3, \t1
174*4882a593Smuzhiyun	eor		\s1, \t2, \s2
175*4882a593Smuzhiyun	eor		\s3, \s0, \s2
176*4882a593Smuzhiyun	bsl		\s1, \t1, \s0
177*4882a593Smuzhiyun	not		\t0, \s0
178*4882a593Smuzhiyun	bsl		\s0, \s1, \s3
179*4882a593Smuzhiyun	bsl		\t0, \s1, \s3
180*4882a593Smuzhiyun	bsl		\s3, \t3, \t2
181*4882a593Smuzhiyun	eor		\t3, \t3, \t2
182*4882a593Smuzhiyun	and		\s2, \s0, \s3
183*4882a593Smuzhiyun	eor		\t1, \t1, \t0
184*4882a593Smuzhiyun	eor		\s2, \s2, \t3
185*4882a593Smuzhiyun	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
186*4882a593Smuzhiyun			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
187*4882a593Smuzhiyun	.endm
188*4882a593Smuzhiyun
189*4882a593Smuzhiyun	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
190*4882a593Smuzhiyun			      t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * Forward S-box layer on the eight vectors b0-b7, in place.
	 * Arguments are bare vector register names (e.g. v0); the .16b
	 * arrangement is appended here. t0-t3 and s0-s3 are scratch.
	 *
	 * Three stages: in_bs_ch, inv_gf256, out_bs_ch. The register order
	 * passed to each stage is permuted relative to the previous one, and
	 * the result is left in permuted order as well: aesbs_encrypt8
	 * continues with b0, b1, b4, b6, b3, b7, b2, b5.
	 */
191*4882a593Smuzhiyun	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
192*4882a593Smuzhiyun			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
193*4882a593Smuzhiyun	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
194*4882a593Smuzhiyun			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
195*4882a593Smuzhiyun			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
196*4882a593Smuzhiyun			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
197*4882a593Smuzhiyun	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
198*4882a593Smuzhiyun			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
199*4882a593Smuzhiyun	.endm
200*4882a593Smuzhiyun
201*4882a593Smuzhiyun	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
202*4882a593Smuzhiyun				  t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * Inverse S-box layer on the eight vectors b0-b7, in place.
	 * Arguments are bare vector register names; the .16b arrangement is
	 * appended here. t0-t3 and s0-s3 are scratch.
	 *
	 * Three stages: inv_in_bs_ch, inv_gf256 (shared with sbox),
	 * inv_out_bs_ch, each with its own register permutation. The result
	 * is left in permuted order: aesbs_decrypt8 continues with
	 * b0, b1, b6, b4, b2, b7, b3, b5.
	 */
203*4882a593Smuzhiyun	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
204*4882a593Smuzhiyun			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
205*4882a593Smuzhiyun	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
206*4882a593Smuzhiyun			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
207*4882a593Smuzhiyun			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
208*4882a593Smuzhiyun			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
209*4882a593Smuzhiyun	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
210*4882a593Smuzhiyun			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
211*4882a593Smuzhiyun	.endm
212*4882a593Smuzhiyun
213*4882a593Smuzhiyun	.macro		enc_next_rk
	/*
	 * Load the next 128-byte round key (eight 16-byte vectors) from
	 * [bskey] into q16-q23. The first ldp post-increments bskey by 128,
	 * so the remaining loads use negative offsets from the new value.
	 */
214*4882a593Smuzhiyun	ldp		q16, q17, [bskey], #128
215*4882a593Smuzhiyun	ldp		q18, q19, [bskey, #-96]
216*4882a593Smuzhiyun	ldp		q20, q21, [bskey, #-64]
217*4882a593Smuzhiyun	ldp		q22, q23, [bskey, #-32]
218*4882a593Smuzhiyun	.endm
219*4882a593Smuzhiyun
220*4882a593Smuzhiyun	.macro		dec_next_rk
	/*
	 * Load the previous 128-byte round key into q16-q23, walking the key
	 * schedule backwards: the first ldp pre-decrements bskey by 128, the
	 * remaining loads use positive offsets from the new value.
	 */
221*4882a593Smuzhiyun	ldp		q16, q17, [bskey, #-128]!
222*4882a593Smuzhiyun	ldp		q18, q19, [bskey, #32]
223*4882a593Smuzhiyun	ldp		q20, q21, [bskey, #64]
224*4882a593Smuzhiyun	ldp		q22, q23, [bskey, #96]
225*4882a593Smuzhiyun	.endm
226*4882a593Smuzhiyun
227*4882a593Smuzhiyun	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	/*
	 * XOR the eight operands, in argument order, with v16-v23, i.e. with
	 * the round key most recently loaded by enc_next_rk/dec_next_rk.
	 * Arguments are bare vector register names.
	 */
228*4882a593Smuzhiyun	eor		\x0\().16b, \x0\().16b, v16.16b
229*4882a593Smuzhiyun	eor		\x1\().16b, \x1\().16b, v17.16b
230*4882a593Smuzhiyun	eor		\x2\().16b, \x2\().16b, v18.16b
231*4882a593Smuzhiyun	eor		\x3\().16b, \x3\().16b, v19.16b
232*4882a593Smuzhiyun	eor		\x4\().16b, \x4\().16b, v20.16b
233*4882a593Smuzhiyun	eor		\x5\().16b, \x5\().16b, v21.16b
234*4882a593Smuzhiyun	eor		\x6\().16b, \x6\().16b, v22.16b
235*4882a593Smuzhiyun	eor		\x7\().16b, \x7\().16b, v23.16b
236*4882a593Smuzhiyun	.endm
237*4882a593Smuzhiyun
238*4882a593Smuzhiyun	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	/*
	 * Apply the same byte permutation to each of the eight operands, in
	 * place: tbl replaces byte i of a vector with the byte selected by
	 * index i of mask. Callers pass v24, which holds one of the SR/ISR
	 * (or, on the last pass, SRM0/ISRM0) tables.
	 */
239*4882a593Smuzhiyun	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
240*4882a593Smuzhiyun	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
241*4882a593Smuzhiyun	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
242*4882a593Smuzhiyun	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
243*4882a593Smuzhiyun	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
244*4882a593Smuzhiyun	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
245*4882a593Smuzhiyun	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
246*4882a593Smuzhiyun	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
247*4882a593Smuzhiyun	.endm
248*4882a593Smuzhiyun
249*4882a593Smuzhiyun	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
250*4882a593Smuzhiyun				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	/*
	 * MixColumns step on the eight bit-sliced vectors x0-x7.
	 * t0-t7 are scratch and are clobbered.
	 *
	 * ext with both sources equal rotates the 16 bytes of a vector, here
	 * by 12 and by 8 byte positions; everything else is XOR.
	 *
	 * inv is optional. When it is omitted (.ifb), the results are
	 * written to x0-x7 in the arrangement used by aesbs_encrypt8. When
	 * it is given (inv_mix_cols passes 1), the last XORs write x2, x3, x4
	 * and x6 in a different arrangement instead.
	 */
251*4882a593Smuzhiyun	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
252*4882a593Smuzhiyun	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
253*4882a593Smuzhiyun	eor		\x0\().16b, \x0\().16b, \t0\().16b
254*4882a593Smuzhiyun	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
255*4882a593Smuzhiyun	eor		\x1\().16b, \x1\().16b, \t1\().16b
256*4882a593Smuzhiyun	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
257*4882a593Smuzhiyun	eor		\x2\().16b, \x2\().16b, \t2\().16b
258*4882a593Smuzhiyun	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
259*4882a593Smuzhiyun	eor		\x3\().16b, \x3\().16b, \t3\().16b
260*4882a593Smuzhiyun	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
261*4882a593Smuzhiyun	eor		\x4\().16b, \x4\().16b, \t4\().16b
262*4882a593Smuzhiyun	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
263*4882a593Smuzhiyun	eor		\x5\().16b, \x5\().16b, \t5\().16b
264*4882a593Smuzhiyun	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
265*4882a593Smuzhiyun	eor		\x6\().16b, \x6\().16b, \t6\().16b
266*4882a593Smuzhiyun	eor		\t1\().16b, \t1\().16b, \x0\().16b
267*4882a593Smuzhiyun	eor		\x7\().16b, \x7\().16b, \t7\().16b
268*4882a593Smuzhiyun	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
269*4882a593Smuzhiyun	eor		\t2\().16b, \t2\().16b, \x1\().16b
270*4882a593Smuzhiyun	eor		\t0\().16b, \t0\().16b, \x7\().16b
271*4882a593Smuzhiyun	eor		\t1\().16b, \t1\().16b, \x7\().16b
272*4882a593Smuzhiyun	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
273*4882a593Smuzhiyun	eor		\t5\().16b, \t5\().16b, \x4\().16b
274*4882a593Smuzhiyun	eor		\x0\().16b, \x0\().16b, \t0\().16b
275*4882a593Smuzhiyun	eor		\t6\().16b, \t6\().16b, \x5\().16b
276*4882a593Smuzhiyun	eor		\x1\().16b, \x1\().16b, \t1\().16b
277*4882a593Smuzhiyun	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
278*4882a593Smuzhiyun	eor		\t4\().16b, \t4\().16b, \x3\().16b
279*4882a593Smuzhiyun	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
280*4882a593Smuzhiyun	eor		\t7\().16b, \t7\().16b, \x6\().16b
281*4882a593Smuzhiyun	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
282*4882a593Smuzhiyun	eor		\t3\().16b, \t3\().16b, \x2\().16b
283*4882a593Smuzhiyun	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
284*4882a593Smuzhiyun	eor		\t4\().16b, \t4\().16b, \x7\().16b
285*4882a593Smuzhiyun	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
286*4882a593Smuzhiyun	eor		\t3\().16b, \t3\().16b, \x7\().16b
287*4882a593Smuzhiyun	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
288*4882a593Smuzhiyun	eor		\x7\().16b, \t1\().16b, \t5\().16b
289*4882a593Smuzhiyun	.ifb		\inv
290*4882a593Smuzhiyun	eor		\x2\().16b, \t0\().16b, \t4\().16b
291*4882a593Smuzhiyun	eor		\x4\().16b, \x4\().16b, \t3\().16b
292*4882a593Smuzhiyun	eor		\x5\().16b, \x5\().16b, \t7\().16b
293*4882a593Smuzhiyun	eor		\x3\().16b, \x3\().16b, \t6\().16b
294*4882a593Smuzhiyun	eor		\x6\().16b, \x6\().16b, \t2\().16b
295*4882a593Smuzhiyun	.else
296*4882a593Smuzhiyun	eor		\t3\().16b, \t3\().16b, \x4\().16b
297*4882a593Smuzhiyun	eor		\x5\().16b, \x5\().16b, \t7\().16b
298*4882a593Smuzhiyun	eor		\x2\().16b, \x3\().16b, \t6\().16b
299*4882a593Smuzhiyun	eor		\x3\().16b, \t0\().16b, \t4\().16b
300*4882a593Smuzhiyun	eor		\x4\().16b, \x6\().16b, \t2\().16b
301*4882a593Smuzhiyun	mov		\x6\().16b, \t3\().16b
302*4882a593Smuzhiyun	.endif
303*4882a593Smuzhiyun	.endm
304*4882a593Smuzhiyun
305*4882a593Smuzhiyun	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
306*4882a593Smuzhiyun				      t0, t1, t2, t3, t4, t5, t6, t7
	/*
	 * Inverse MixColumns on the eight bit-sliced vectors x0-x7, built as
	 * a pre-processing pass followed by the forward mix_cols.
	 *
	 * Pre-processing: each tN becomes xN XOR (xN rotated by 8 bytes), and
	 * selected tN are then XORed into x0-x7. mix_cols is finally invoked
	 * with inv=1 so that it emits its alternative output arrangement.
	 * t0-t7 are clobbered.
	 */
307*4882a593Smuzhiyun	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
308*4882a593Smuzhiyun	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
309*4882a593Smuzhiyun	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
310*4882a593Smuzhiyun	eor		\t0\().16b, \t0\().16b, \x0\().16b
311*4882a593Smuzhiyun	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
312*4882a593Smuzhiyun	eor		\t6\().16b, \t6\().16b, \x6\().16b
313*4882a593Smuzhiyun	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
314*4882a593Smuzhiyun	eor		\t7\().16b, \t7\().16b, \x7\().16b
315*4882a593Smuzhiyun	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
316*4882a593Smuzhiyun	eor		\t1\().16b, \t1\().16b, \x1\().16b
317*4882a593Smuzhiyun	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
318*4882a593Smuzhiyun	eor		\t2\().16b, \t2\().16b, \x2\().16b
319*4882a593Smuzhiyun	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
320*4882a593Smuzhiyun	eor		\t3\().16b, \t3\().16b, \x3\().16b
321*4882a593Smuzhiyun	eor		\t4\().16b, \t4\().16b, \x4\().16b
322*4882a593Smuzhiyun	eor		\t5\().16b, \t5\().16b, \x5\().16b
323*4882a593Smuzhiyun	eor		\x0\().16b, \x0\().16b, \t6\().16b
324*4882a593Smuzhiyun	eor		\x1\().16b, \x1\().16b, \t6\().16b
325*4882a593Smuzhiyun	eor		\x2\().16b, \x2\().16b, \t0\().16b
326*4882a593Smuzhiyun	eor		\x4\().16b, \x4\().16b, \t2\().16b
327*4882a593Smuzhiyun	eor		\x3\().16b, \x3\().16b, \t1\().16b
328*4882a593Smuzhiyun	eor		\x1\().16b, \x1\().16b, \t7\().16b
329*4882a593Smuzhiyun	eor		\x2\().16b, \x2\().16b, \t7\().16b
330*4882a593Smuzhiyun	eor		\x4\().16b, \x4\().16b, \t6\().16b
331*4882a593Smuzhiyun	eor		\x5\().16b, \x5\().16b, \t3\().16b
332*4882a593Smuzhiyun	eor		\x3\().16b, \x3\().16b, \t6\().16b
333*4882a593Smuzhiyun	eor		\x6\().16b, \x6\().16b, \t4\().16b
334*4882a593Smuzhiyun	eor		\x4\().16b, \x4\().16b, \t7\().16b
335*4882a593Smuzhiyun	eor		\x5\().16b, \x5\().16b, \t7\().16b
336*4882a593Smuzhiyun	eor		\x7\().16b, \x7\().16b, \t5\().16b
337*4882a593Smuzhiyun	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
338*4882a593Smuzhiyun			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
339*4882a593Smuzhiyun	.endm
340*4882a593Smuzhiyun
341*4882a593Smuzhiyun	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	/*
	 * Two interleaved masked bit swaps, one on the pair (a0, b0) and one
	 * on (a1, b1). For each pair, with shifts done per 64-bit lane:
	 *
	 *   t  = ((b >> n) ^ a) & mask
	 *   a ^= t
	 *   b ^= t << n
	 *
	 * i.e. the bits of a selected by mask are exchanged with the bits of
	 * b selected by (mask << n). n is an immediate; t0/t1 are clobbered.
	 */
342*4882a593Smuzhiyun	ushr		\t0\().2d, \b0\().2d, #\n
343*4882a593Smuzhiyun	ushr		\t1\().2d, \b1\().2d, #\n
344*4882a593Smuzhiyun	eor		\t0\().16b, \t0\().16b, \a0\().16b
345*4882a593Smuzhiyun	eor		\t1\().16b, \t1\().16b, \a1\().16b
346*4882a593Smuzhiyun	and		\t0\().16b, \t0\().16b, \mask\().16b
347*4882a593Smuzhiyun	and		\t1\().16b, \t1\().16b, \mask\().16b
348*4882a593Smuzhiyun	eor		\a0\().16b, \a0\().16b, \t0\().16b
349*4882a593Smuzhiyun	shl		\t0\().2d, \t0\().2d, #\n
350*4882a593Smuzhiyun	eor		\a1\().16b, \a1\().16b, \t1\().16b
351*4882a593Smuzhiyun	shl		\t1\().2d, \t1\().2d, #\n
352*4882a593Smuzhiyun	eor		\b0\().16b, \b0\().16b, \t0\().16b
353*4882a593Smuzhiyun	eor		\b1\().16b, \b1\().16b, \t1\().16b
354*4882a593Smuzhiyun	.endm
355*4882a593Smuzhiyun
356*4882a593Smuzhiyun	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	/*
	 * Bit-matrix transposition of the eight vectors, in place, done with
	 * three rounds of swapmove_2x: shift 1 with mask 0x55, shift 2 with
	 * mask 0x33, shift 4 with mask 0x0f. The same macro is used on entry
	 * to and on exit from aesbs_encrypt8/aesbs_decrypt8.
	 *
	 * Note that the parameters are declared in REVERSE order (x7 first),
	 * so the first register a caller passes is x7 here.
	 * t0/t1 hold the masks, t2/t3 are scratch; all four are clobbered.
	 */
357*4882a593Smuzhiyun	movi		\t0\().16b, #0x55
358*4882a593Smuzhiyun	movi		\t1\().16b, #0x33
359*4882a593Smuzhiyun	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
360*4882a593Smuzhiyun	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
361*4882a593Smuzhiyun	movi		\t0\().16b, #0x0f
362*4882a593Smuzhiyun	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
363*4882a593Smuzhiyun	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
364*4882a593Smuzhiyun	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
365*4882a593Smuzhiyun	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
366*4882a593Smuzhiyun	.endm
367*4882a593Smuzhiyun
368*4882a593Smuzhiyun
369*4882a593Smuzhiyun	.align		6
	/*
	 * 16-byte index vectors for tbl (byte permutations), loaded with
	 * PC-relative ldr by the routines below. .align 6 is a power-of-two
	 * alignment on arm64, i.e. 64 bytes.
	 *
	 *   M0           applied to every inner round key by aesbs_convert_key
	 *   M0SR/M0ISR   applied to the input blocks on entry to
	 *                aesbs_encrypt8/aesbs_decrypt8
	 *   SR/ISR       shift_rows mask used inside the round loop
	 *   SRM0/ISRM0   replaces SR/ISR for the final pass through the loop
	 *
	 * The names suggest compositions of the M0 byte order with
	 * (Inv)ShiftRows - NOTE(review): not verified against the constants.
	 */
370*4882a593SmuzhiyunM0:	.octa		0x0004080c0105090d02060a0e03070b0f
371*4882a593Smuzhiyun
372*4882a593SmuzhiyunM0SR:	.octa		0x0004080c05090d010a0e02060f03070b
373*4882a593SmuzhiyunSR:	.octa		0x0f0e0d0c0a09080b0504070600030201
374*4882a593SmuzhiyunSRM0:	.octa		0x01060b0c0207080d0304090e00050a0f
375*4882a593Smuzhiyun
376*4882a593SmuzhiyunM0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
377*4882a593SmuzhiyunISR:	.octa		0x0f0e0d0c080b0a090504070602010003
378*4882a593SmuzhiyunISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f
379*4882a593Smuzhiyun
380*4882a593Smuzhiyun	/*
381*4882a593Smuzhiyun	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
382*4882a593Smuzhiyun	 */
383*4882a593SmuzhiyunSYM_FUNC_START(aesbs_convert_key)
	/*
	 * Convert an expanded AES key schedule into the layout consumed by
	 * aesbs_encrypt8/aesbs_decrypt8.
	 *
	 * In:  x0 = out, x1 = rk (16 bytes per round key), w2 = rounds
	 *
	 * Output layout:
	 *   round 0 key            16 bytes, copied unchanged
	 *   round 1..rounds-1 keys 128 bytes each: the key is permuted with
	 *                          M0 and expanded into eight vectors, one per
	 *                          bit position (cmtst against 0x01..0x80);
	 *                          planes 0, 1, 5 and 6 - the set bits of
	 *                          0x63 - are stored inverted
	 *   last round key         16 bytes, XORed with 0x63 in every byte
	 *
	 * Clobbers: x0-x2, v0-v17, condition flags.
	 *
	 * rounds is a C int, so only w2 is defined on entry (AAPCS64 leaves
	 * bits [63:32] of the register unspecified). The loop counter is
	 * therefore kept in w2; counting in x2 would make loop termination
	 * depend on those unspecified bits.
	 */
384*4882a593Smuzhiyun	ld1		{v7.4s}, [x1], #16		// load round 0 key
385*4882a593Smuzhiyun	ld1		{v17.4s}, [x1], #16		// load round 1 key
386*4882a593Smuzhiyun
387*4882a593Smuzhiyun	movi		v8.16b,  #0x01			// bit masks
388*4882a593Smuzhiyun	movi		v9.16b,  #0x02
389*4882a593Smuzhiyun	movi		v10.16b, #0x04
390*4882a593Smuzhiyun	movi		v11.16b, #0x08
391*4882a593Smuzhiyun	movi		v12.16b, #0x10
392*4882a593Smuzhiyun	movi		v13.16b, #0x20
393*4882a593Smuzhiyun	movi		v14.16b, #0x40
394*4882a593Smuzhiyun	movi		v15.16b, #0x80
395*4882a593Smuzhiyun	ldr		q16, M0
396*4882a593Smuzhiyun
397*4882a593Smuzhiyun	sub		w2, w2, #1			// rounds - 1 inner round keys
398*4882a593Smuzhiyun	str		q7, [x0], #16		// save round 0 key
399*4882a593Smuzhiyun
400*4882a593Smuzhiyun.Lkey_loop:
401*4882a593Smuzhiyun	tbl		v7.16b, {v17.16b}, v16.16b	// permute current key with M0
402*4882a593Smuzhiyun	ld1		{v17.4s}, [x1], #16		// load next round key
403*4882a593Smuzhiyun
	/* v0-v7: all-ones in every byte whose bit 0..7 is set in the key */
404*4882a593Smuzhiyun	cmtst		v0.16b, v7.16b, v8.16b
405*4882a593Smuzhiyun	cmtst		v1.16b, v7.16b, v9.16b
406*4882a593Smuzhiyun	cmtst		v2.16b, v7.16b, v10.16b
407*4882a593Smuzhiyun	cmtst		v3.16b, v7.16b, v11.16b
408*4882a593Smuzhiyun	cmtst		v4.16b, v7.16b, v12.16b
409*4882a593Smuzhiyun	cmtst		v5.16b, v7.16b, v13.16b
410*4882a593Smuzhiyun	cmtst		v6.16b, v7.16b, v14.16b
411*4882a593Smuzhiyun	cmtst		v7.16b, v7.16b, v15.16b
	/* invert the planes that correspond to the set bits of 0x63 */
412*4882a593Smuzhiyun	not		v0.16b, v0.16b
413*4882a593Smuzhiyun	not		v1.16b, v1.16b
414*4882a593Smuzhiyun	not		v5.16b, v5.16b
415*4882a593Smuzhiyun	not		v6.16b, v6.16b
416*4882a593Smuzhiyun
417*4882a593Smuzhiyun	subs		w2, w2, #1
418*4882a593Smuzhiyun	stp		q0, q1, [x0], #128
419*4882a593Smuzhiyun	stp		q2, q3, [x0, #-96]
420*4882a593Smuzhiyun	stp		q4, q5, [x0, #-64]
421*4882a593Smuzhiyun	stp		q6, q7, [x0, #-32]
422*4882a593Smuzhiyun	b.ne		.Lkey_loop			// flags from subs; stp leaves them alone
423*4882a593Smuzhiyun
424*4882a593Smuzhiyun	movi		v7.16b, #0x63			// compose .L63
425*4882a593Smuzhiyun	eor		v17.16b, v17.16b, v7.16b
426*4882a593Smuzhiyun	str		q17, [x0]
427*4882a593Smuzhiyun	ret
428*4882a593SmuzhiyunSYM_FUNC_END(aesbs_convert_key)
429*4882a593Smuzhiyun
430*4882a593Smuzhiyun	.align		4
431*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(aesbs_encrypt8)
	/*
	 * Encrypt eight blocks in parallel. File-local register convention,
	 * not AAPCS64.
	 *
	 * In:       v0-v7  = input blocks 0-7
	 *           bskey  = key schedule as written by aesbs_convert_key
	 *           rounds = number of AES rounds
	 * Out:      blocks 0-7 in v0, v1, v4, v6, v3, v7, v2, v5
	 * Clobbers: v8-v24, bskey, rounds, condition flags
	 */
432*4882a593Smuzhiyun	ldr		q9, [bskey], #16		// round 0 key
433*4882a593Smuzhiyun	ldr		q8, M0SR
434*4882a593Smuzhiyun	ldr		q24, SR
435*4882a593Smuzhiyun
436*4882a593Smuzhiyun	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
437*4882a593Smuzhiyun	eor		v11.16b, v1.16b, v9.16b
438*4882a593Smuzhiyun	tbl		v0.16b, {v10.16b}, v8.16b
439*4882a593Smuzhiyun	eor		v12.16b, v2.16b, v9.16b
440*4882a593Smuzhiyun	tbl		v1.16b, {v11.16b}, v8.16b
441*4882a593Smuzhiyun	eor		v13.16b, v3.16b, v9.16b
442*4882a593Smuzhiyun	tbl		v2.16b, {v12.16b}, v8.16b
443*4882a593Smuzhiyun	eor		v14.16b, v4.16b, v9.16b
444*4882a593Smuzhiyun	tbl		v3.16b, {v13.16b}, v8.16b
445*4882a593Smuzhiyun	eor		v15.16b, v5.16b, v9.16b
446*4882a593Smuzhiyun	tbl		v4.16b, {v14.16b}, v8.16b
447*4882a593Smuzhiyun	eor		v10.16b, v6.16b, v9.16b
448*4882a593Smuzhiyun	tbl		v5.16b, {v15.16b}, v8.16b
449*4882a593Smuzhiyun	eor		v11.16b, v7.16b, v9.16b
450*4882a593Smuzhiyun	tbl		v6.16b, {v10.16b}, v8.16b
451*4882a593Smuzhiyun	tbl		v7.16b, {v11.16b}, v8.16b
452*4882a593Smuzhiyun
453*4882a593Smuzhiyun	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
454*4882a593Smuzhiyun
	/*
	 * Enter the loop at the S-box: the first pass skips shift_rows, the
	 * input having been permuted with M0SR above.
	 */
455*4882a593Smuzhiyun	sub		rounds, rounds, #1
456*4882a593Smuzhiyun	b		.Lenc_sbox
457*4882a593Smuzhiyun
458*4882a593Smuzhiyun.Lenc_loop:
459*4882a593Smuzhiyun	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
460*4882a593Smuzhiyun.Lenc_sbox:
461*4882a593Smuzhiyun	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
462*4882a593Smuzhiyun								v13, v14, v15
	/* borrow (carry clear) means the counter was already 0: last S-box done */
463*4882a593Smuzhiyun	subs		rounds, rounds, #1
464*4882a593Smuzhiyun	b.cc		.Lenc_done
465*4882a593Smuzhiyun
466*4882a593Smuzhiyun	enc_next_rk
467*4882a593Smuzhiyun
	/* operands in the permuted order in which sbox leaves its result */
468*4882a593Smuzhiyun	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
469*4882a593Smuzhiyun								v13, v14, v15
470*4882a593Smuzhiyun
471*4882a593Smuzhiyun	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
472*4882a593Smuzhiyun
	/*
	 * Still the flags of the subs above: nothing in between writes NZCV.
	 * Zero means the next pass is the last one, so switch the
	 * shift_rows mask to SRM0.
	 */
473*4882a593Smuzhiyun	b.ne		.Lenc_loop
474*4882a593Smuzhiyun	ldr		q24, SRM0
475*4882a593Smuzhiyun	b		.Lenc_loop
476*4882a593Smuzhiyun
477*4882a593Smuzhiyun.Lenc_done:
478*4882a593Smuzhiyun	ldr		q12, [bskey]			// last round key
479*4882a593Smuzhiyun
	/* transpose back, then add the last round key to every block */
480*4882a593Smuzhiyun	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
481*4882a593Smuzhiyun
482*4882a593Smuzhiyun	eor		v0.16b, v0.16b, v12.16b
483*4882a593Smuzhiyun	eor		v1.16b, v1.16b, v12.16b
484*4882a593Smuzhiyun	eor		v4.16b, v4.16b, v12.16b
485*4882a593Smuzhiyun	eor		v6.16b, v6.16b, v12.16b
486*4882a593Smuzhiyun	eor		v3.16b, v3.16b, v12.16b
487*4882a593Smuzhiyun	eor		v7.16b, v7.16b, v12.16b
488*4882a593Smuzhiyun	eor		v2.16b, v2.16b, v12.16b
489*4882a593Smuzhiyun	eor		v5.16b, v5.16b, v12.16b
490*4882a593Smuzhiyun	ret
491*4882a593SmuzhiyunSYM_FUNC_END(aesbs_encrypt8)
492*4882a593Smuzhiyun
493*4882a593Smuzhiyun	.align		4
494*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(aesbs_decrypt8)
	/*
	 * Decrypt eight blocks in parallel. File-local register convention,
	 * not AAPCS64.
	 *
	 * In:       v0-v7  = input blocks 0-7
	 *           bskey  = start of the key schedule as written by
	 *                    aesbs_convert_key
	 *           rounds = number of AES rounds
	 * Out:      blocks 0-7 in v0, v1, v6, v4, v2, v7, v3, v5
	 * Clobbers: x9, v8-v24, bskey, rounds, condition flags
	 *
	 * The key schedule is walked backwards. bskey + rounds * 128 - 112
	 * equals bskey + 16 + (rounds - 1) * 128, which is where
	 * aesbs_convert_key stored the final 16-byte key.
	 */
495*4882a593Smuzhiyun	lsl		x9, rounds, #7
496*4882a593Smuzhiyun	add		bskey, bskey, x9
497*4882a593Smuzhiyun
498*4882a593Smuzhiyun	ldr		q9, [bskey, #-112]!		// round 0 key
499*4882a593Smuzhiyun	ldr		q8, M0ISR
500*4882a593Smuzhiyun	ldr		q24, ISR
501*4882a593Smuzhiyun
502*4882a593Smuzhiyun	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
503*4882a593Smuzhiyun	eor		v11.16b, v1.16b, v9.16b
504*4882a593Smuzhiyun	tbl		v0.16b, {v10.16b}, v8.16b
505*4882a593Smuzhiyun	eor		v12.16b, v2.16b, v9.16b
506*4882a593Smuzhiyun	tbl		v1.16b, {v11.16b}, v8.16b
507*4882a593Smuzhiyun	eor		v13.16b, v3.16b, v9.16b
508*4882a593Smuzhiyun	tbl		v2.16b, {v12.16b}, v8.16b
509*4882a593Smuzhiyun	eor		v14.16b, v4.16b, v9.16b
510*4882a593Smuzhiyun	tbl		v3.16b, {v13.16b}, v8.16b
511*4882a593Smuzhiyun	eor		v15.16b, v5.16b, v9.16b
512*4882a593Smuzhiyun	tbl		v4.16b, {v14.16b}, v8.16b
513*4882a593Smuzhiyun	eor		v10.16b, v6.16b, v9.16b
514*4882a593Smuzhiyun	tbl		v5.16b, {v15.16b}, v8.16b
515*4882a593Smuzhiyun	eor		v11.16b, v7.16b, v9.16b
516*4882a593Smuzhiyun	tbl		v6.16b, {v10.16b}, v8.16b
517*4882a593Smuzhiyun	tbl		v7.16b, {v11.16b}, v8.16b
518*4882a593Smuzhiyun
519*4882a593Smuzhiyun	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
520*4882a593Smuzhiyun
	/*
	 * Enter the loop at the inverse S-box: the first pass skips
	 * shift_rows, the input having been permuted with M0ISR above.
	 */
521*4882a593Smuzhiyun	sub		rounds, rounds, #1
522*4882a593Smuzhiyun	b		.Ldec_sbox
523*4882a593Smuzhiyun
524*4882a593Smuzhiyun.Ldec_loop:
525*4882a593Smuzhiyun	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
526*4882a593Smuzhiyun.Ldec_sbox:
527*4882a593Smuzhiyun	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
528*4882a593Smuzhiyun								v13, v14, v15
	/* borrow (carry clear) means the counter was already 0: last S-box done */
529*4882a593Smuzhiyun	subs		rounds, rounds, #1
530*4882a593Smuzhiyun	b.cc		.Ldec_done
531*4882a593Smuzhiyun
532*4882a593Smuzhiyun	dec_next_rk
533*4882a593Smuzhiyun
	/* operands in the permuted order in which inv_sbox leaves its result */
534*4882a593Smuzhiyun	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
535*4882a593Smuzhiyun
536*4882a593Smuzhiyun	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
537*4882a593Smuzhiyun								v13, v14, v15
538*4882a593Smuzhiyun
	/*
	 * Still the flags of the subs above: nothing in between writes NZCV.
	 * Zero means the next pass is the last one, so switch the
	 * shift_rows mask to ISRM0.
	 */
539*4882a593Smuzhiyun	b.ne		.Ldec_loop
540*4882a593Smuzhiyun	ldr		q24, ISRM0
541*4882a593Smuzhiyun	b		.Ldec_loop
542*4882a593Smuzhiyun.Ldec_done:
	/* bskey is back at start + 16 here, so this is the 16-byte key at the start */
543*4882a593Smuzhiyun	ldr		q12, [bskey, #-16]		// last round key
544*4882a593Smuzhiyun
545*4882a593Smuzhiyun	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
546*4882a593Smuzhiyun
547*4882a593Smuzhiyun	eor		v0.16b, v0.16b, v12.16b
548*4882a593Smuzhiyun	eor		v1.16b, v1.16b, v12.16b
549*4882a593Smuzhiyun	eor		v6.16b, v6.16b, v12.16b
550*4882a593Smuzhiyun	eor		v4.16b, v4.16b, v12.16b
551*4882a593Smuzhiyun	eor		v2.16b, v2.16b, v12.16b
552*4882a593Smuzhiyun	eor		v7.16b, v7.16b, v12.16b
553*4882a593Smuzhiyun	eor		v3.16b, v3.16b, v12.16b
554*4882a593Smuzhiyun	eor		v5.16b, v5.16b, v12.16b
555*4882a593Smuzhiyun	ret
556*4882a593SmuzhiyunSYM_FUNC_END(aesbs_decrypt8)
557*4882a593Smuzhiyun
558*4882a593Smuzhiyun	/*
559*4882a593Smuzhiyun	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
560*4882a593Smuzhiyun	 *		     int blocks)
561*4882a593Smuzhiyun	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
562*4882a593Smuzhiyun	 *		     int blocks)
563*4882a593Smuzhiyun	 */
564*4882a593Smuzhiyun	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	/*
	 * Body shared by aesbs_ecb_encrypt and aesbs_ecb_decrypt.
	 *
	 * do8    8-way block routine to call (aesbs_encrypt8/aesbs_decrypt8)
	 * o0-o7  vector registers holding the results of do8, in block order
	 *
	 * In: x0 = out, x1 = in, x2 = converted key schedule,
	 *     w3 = rounds, w4 = blocks
	 *
	 * Input is consumed in batches of up to eight blocks. For a final
	 * batch of fewer than eight, x5 has the single bit number
	 * "blocks remaining" set, so the tbnz chains below stop loading and
	 * storing after that many blocks; for a full batch x5 is zero.
	 * x5 must survive the call to do8, which does not touch it.
	 */
565*4882a593Smuzhiyun	frame_push	5
566*4882a593Smuzhiyun
567*4882a593Smuzhiyun	mov		x19, x0
568*4882a593Smuzhiyun	mov		x20, x1
569*4882a593Smuzhiyun	mov		x21, x2
	/*
	 * rounds and blocks are C ints: AAPCS64 leaves bits [63:32] of their
	 * argument registers unspecified, and rounds is later used as a
	 * 64-bit loop counter and key schedule offset by do8. Copy through W
	 * registers so that the upper halves are zero.
	 */
570*4882a593Smuzhiyun	mov		w22, w3
571*4882a593Smuzhiyun	mov		w23, w4
	/*
	 * blocks == 0 would set bit 0 of x5, which no tbnz tests, and so
	 * process a full batch of eight blocks. Bail out instead.
	 */
	cbz		w23, 1f
572*4882a593Smuzhiyun
573*4882a593Smuzhiyun99:	mov		x5, #1
574*4882a593Smuzhiyun	lsl		x5, x5, x23
575*4882a593Smuzhiyun	subs		w23, w23, #8
576*4882a593Smuzhiyun	csel		x23, x23, xzr, pl
577*4882a593Smuzhiyun	csel		x5, x5, xzr, mi
578*4882a593Smuzhiyun
579*4882a593Smuzhiyun	ld1		{v0.16b}, [x20], #16
580*4882a593Smuzhiyun	tbnz		x5, #1, 0f
581*4882a593Smuzhiyun	ld1		{v1.16b}, [x20], #16
582*4882a593Smuzhiyun	tbnz		x5, #2, 0f
583*4882a593Smuzhiyun	ld1		{v2.16b}, [x20], #16
584*4882a593Smuzhiyun	tbnz		x5, #3, 0f
585*4882a593Smuzhiyun	ld1		{v3.16b}, [x20], #16
586*4882a593Smuzhiyun	tbnz		x5, #4, 0f
587*4882a593Smuzhiyun	ld1		{v4.16b}, [x20], #16
588*4882a593Smuzhiyun	tbnz		x5, #5, 0f
589*4882a593Smuzhiyun	ld1		{v5.16b}, [x20], #16
590*4882a593Smuzhiyun	tbnz		x5, #6, 0f
591*4882a593Smuzhiyun	ld1		{v6.16b}, [x20], #16
592*4882a593Smuzhiyun	tbnz		x5, #7, 0f
593*4882a593Smuzhiyun	ld1		{v7.16b}, [x20], #16
594*4882a593Smuzhiyun
595*4882a593Smuzhiyun0:	mov		bskey, x21
596*4882a593Smuzhiyun	mov		rounds, x22
597*4882a593Smuzhiyun	bl		\do8
598*4882a593Smuzhiyun
599*4882a593Smuzhiyun	st1		{\o0\().16b}, [x19], #16
600*4882a593Smuzhiyun	tbnz		x5, #1, 1f
601*4882a593Smuzhiyun	st1		{\o1\().16b}, [x19], #16
602*4882a593Smuzhiyun	tbnz		x5, #2, 1f
603*4882a593Smuzhiyun	st1		{\o2\().16b}, [x19], #16
604*4882a593Smuzhiyun	tbnz		x5, #3, 1f
605*4882a593Smuzhiyun	st1		{\o3\().16b}, [x19], #16
606*4882a593Smuzhiyun	tbnz		x5, #4, 1f
607*4882a593Smuzhiyun	st1		{\o4\().16b}, [x19], #16
608*4882a593Smuzhiyun	tbnz		x5, #5, 1f
609*4882a593Smuzhiyun	st1		{\o5\().16b}, [x19], #16
610*4882a593Smuzhiyun	tbnz		x5, #6, 1f
611*4882a593Smuzhiyun	st1		{\o6\().16b}, [x19], #16
612*4882a593Smuzhiyun	tbnz		x5, #7, 1f
613*4882a593Smuzhiyun	st1		{\o7\().16b}, [x19], #16
614*4882a593Smuzhiyun
615*4882a593Smuzhiyun	cbnz		x23, 99b			// blocks left: next batch
617*4882a593Smuzhiyun
618*4882a593Smuzhiyun1:	frame_pop
619*4882a593Smuzhiyun	ret
620*4882a593Smuzhiyun	.endm
621*4882a593Smuzhiyun
622*4882a593Smuzhiyun	.align		4
623*4882a593SmuzhiyunSYM_FUNC_START(aesbs_ecb_encrypt)
	/*
	 * ECB encryption entry point; the whole body comes from __ecb_crypt.
	 * The register list is the block order in which aesbs_encrypt8
	 * leaves its results.
	 */
624*4882a593Smuzhiyun	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
625*4882a593SmuzhiyunSYM_FUNC_END(aesbs_ecb_encrypt)
626*4882a593Smuzhiyun
627*4882a593Smuzhiyun	.align		4
628*4882a593SmuzhiyunSYM_FUNC_START(aesbs_ecb_decrypt)
	/*
	 * ECB decryption entry point; the whole body comes from __ecb_crypt.
	 * The register list is the block order in which aesbs_decrypt8
	 * leaves its results.
	 */
629*4882a593Smuzhiyun	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
630*4882a593SmuzhiyunSYM_FUNC_END(aesbs_ecb_decrypt)
631*4882a593Smuzhiyun
632*4882a593Smuzhiyun	/*
633*4882a593Smuzhiyun	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
634*4882a593Smuzhiyun	 *		     int blocks, u8 iv[])
635*4882a593Smuzhiyun	 */
636*4882a593Smuzhiyun	.align		4
	/*
	 * CBC decryption, processed in batches of up to eight blocks.
	 *
	 * Register use (arguments arrive in x0-x5 in prototype order):
	 *   x19     - out (advanced by 16 after every block stored)
	 *   x20     - in (advanced by 16 after every block loaded)
	 *   x21     - rk, handed to aesbs_decrypt8 through bskey
	 *   x22     - rounds, handed to aesbs_decrypt8 through rounds
	 *   x23     - blocks remaining; after the computation at label 99 it
	 *             holds the count left once the current batch is done
	 *   x24     - iv (read for every batch, rewritten after every batch)
	 *   x6      - batch mask: 0 for a full batch of eight, otherwise
	 *             1 << n where n is the size of the last, short batch
	 *   v25-v31 - copies of ciphertext blocks 0..6 of the batch, needed
	 *             as chaining values once the blocks are decrypted
	 *
	 * blocks is expected to be at least 1: for blocks == 0 the mask would
	 * be 1 << 0, bit 0 is never tested, and a full batch of eight blocks
	 * would be read and written.
	 *
	 * The code relies on aesbs_decrypt8 (defined outside this chunk)
	 * leaving x6 and v25-v31 untouched.
	 * NOTE(review): frame_push/frame_pop are not defined in the visible
	 * part of this file; presumably frame_push 6 preserves x19-x24 along
	 * with the frame record - confirm in <asm/assembler.h>.
	 */
637*4882a593SmuzhiyunSYM_FUNC_START(aesbs_cbc_decrypt)
638*4882a593Smuzhiyun	frame_push	6
639*4882a593Smuzhiyun
640*4882a593Smuzhiyun	mov		x19, x0			// out
641*4882a593Smuzhiyun	mov		x20, x1			// in
642*4882a593Smuzhiyun	mov		x21, x2			// rk
643*4882a593Smuzhiyun	mov		x22, x3			// rounds
644*4882a593Smuzhiyun	mov		x23, x4			// blocks
645*4882a593Smuzhiyun	mov		x24, x5			// iv
646*4882a593Smuzhiyun
	/*
	 * Work out the size of this batch: x6 = 1 << blocks. If at least
	 * eight blocks remain, x23 becomes the count left after this batch
	 * and x6 is cleared. Otherwise x23 becomes 0 and x6 keeps a single
	 * bit whose index is the number of blocks in this last batch.
	 */
647*4882a593Smuzhiyun99:	mov		x6, #1
648*4882a593Smuzhiyun	lsl		x6, x6, x23
649*4882a593Smuzhiyun	subs		w23, w23, #8
650*4882a593Smuzhiyun	csel		x23, x23, xzr, pl	// blocks left after this batch
651*4882a593Smuzhiyun	csel		x6, x6, xzr, mi		// mask bit survives only for a short batch
652*4882a593Smuzhiyun
	/*
	 * Load up to eight ciphertext blocks into v0-v7 and keep copies of
	 * the first seven in v25-v31. Each tbnz x6, #n leaves the sequence
	 * as soon as n blocks have been loaded.
	 */
653*4882a593Smuzhiyun	ld1		{v0.16b}, [x20], #16
654*4882a593Smuzhiyun	mov		v25.16b, v0.16b
655*4882a593Smuzhiyun	tbnz		x6, #1, 0f
656*4882a593Smuzhiyun	ld1		{v1.16b}, [x20], #16
657*4882a593Smuzhiyun	mov		v26.16b, v1.16b
658*4882a593Smuzhiyun	tbnz		x6, #2, 0f
659*4882a593Smuzhiyun	ld1		{v2.16b}, [x20], #16
660*4882a593Smuzhiyun	mov		v27.16b, v2.16b
661*4882a593Smuzhiyun	tbnz		x6, #3, 0f
662*4882a593Smuzhiyun	ld1		{v3.16b}, [x20], #16
663*4882a593Smuzhiyun	mov		v28.16b, v3.16b
664*4882a593Smuzhiyun	tbnz		x6, #4, 0f
665*4882a593Smuzhiyun	ld1		{v4.16b}, [x20], #16
666*4882a593Smuzhiyun	mov		v29.16b, v4.16b
667*4882a593Smuzhiyun	tbnz		x6, #5, 0f
668*4882a593Smuzhiyun	ld1		{v5.16b}, [x20], #16
669*4882a593Smuzhiyun	mov		v30.16b, v5.16b
670*4882a593Smuzhiyun	tbnz		x6, #6, 0f
671*4882a593Smuzhiyun	ld1		{v6.16b}, [x20], #16
672*4882a593Smuzhiyun	mov		v31.16b, v6.16b
673*4882a593Smuzhiyun	tbnz		x6, #7, 0f
674*4882a593Smuzhiyun	ld1		{v7.16b}, [x20]		// no post-increment: reloaded below as the next IV
675*4882a593Smuzhiyun
676*4882a593Smuzhiyun0:	mov		bskey, x21
677*4882a593Smuzhiyun	mov		rounds, x22
678*4882a593Smuzhiyun	bl		aesbs_decrypt8
679*4882a593Smuzhiyun
680*4882a593Smuzhiyun	ld1		{v24.16b}, [x24]		// load IV
681*4882a593Smuzhiyun
	/*
	 * Undo the chaining: block 0 is XORed with the IV, block n with
	 * ciphertext block n - 1. The stores below treat v0, v1, v6, v4,
	 * v2, v7, v3, v5 as blocks 0..7, hence the register pairing here.
	 */
682*4882a593Smuzhiyun	eor		v1.16b, v1.16b, v25.16b
683*4882a593Smuzhiyun	eor		v6.16b, v6.16b, v26.16b
684*4882a593Smuzhiyun	eor		v4.16b, v4.16b, v27.16b
685*4882a593Smuzhiyun	eor		v2.16b, v2.16b, v28.16b
686*4882a593Smuzhiyun	eor		v7.16b, v7.16b, v29.16b
687*4882a593Smuzhiyun	eor		v0.16b, v0.16b, v24.16b
688*4882a593Smuzhiyun	eor		v3.16b, v3.16b, v30.16b
689*4882a593Smuzhiyun	eor		v5.16b, v5.16b, v31.16b
690*4882a593Smuzhiyun
	/*
	 * Store the plaintext blocks. After every store v24 is set to the
	 * ciphertext of the block just written, so that on reaching label 1
	 * it holds the last ciphertext block of the batch, which is the IV
	 * for whatever follows.
	 */
691*4882a593Smuzhiyun	st1		{v0.16b}, [x19], #16
692*4882a593Smuzhiyun	mov		v24.16b, v25.16b
693*4882a593Smuzhiyun	tbnz		x6, #1, 1f
694*4882a593Smuzhiyun	st1		{v1.16b}, [x19], #16
695*4882a593Smuzhiyun	mov		v24.16b, v26.16b
696*4882a593Smuzhiyun	tbnz		x6, #2, 1f
697*4882a593Smuzhiyun	st1		{v6.16b}, [x19], #16
698*4882a593Smuzhiyun	mov		v24.16b, v27.16b
699*4882a593Smuzhiyun	tbnz		x6, #3, 1f
700*4882a593Smuzhiyun	st1		{v4.16b}, [x19], #16
701*4882a593Smuzhiyun	mov		v24.16b, v28.16b
702*4882a593Smuzhiyun	tbnz		x6, #4, 1f
703*4882a593Smuzhiyun	st1		{v2.16b}, [x19], #16
704*4882a593Smuzhiyun	mov		v24.16b, v29.16b
705*4882a593Smuzhiyun	tbnz		x6, #5, 1f
706*4882a593Smuzhiyun	st1		{v7.16b}, [x19], #16
707*4882a593Smuzhiyun	mov		v24.16b, v30.16b
708*4882a593Smuzhiyun	tbnz		x6, #6, 1f
709*4882a593Smuzhiyun	st1		{v3.16b}, [x19], #16
710*4882a593Smuzhiyun	mov		v24.16b, v31.16b
711*4882a593Smuzhiyun	tbnz		x6, #7, 1f
712*4882a593Smuzhiyun	ld1		{v24.16b}, [x20], #16		// 8th ciphertext block is the next IV; in moves past it
713*4882a593Smuzhiyun	st1		{v5.16b}, [x19], #16
714*4882a593Smuzhiyun1:	st1		{v24.16b}, [x24]		// store IV
715*4882a593Smuzhiyun
716*4882a593Smuzhiyun	cbz		x23, 2f			// nothing left after this batch
717*4882a593Smuzhiyun	b		99b
718*4882a593Smuzhiyun
719*4882a593Smuzhiyun2:	frame_pop
720*4882a593Smuzhiyun	ret
721*4882a593SmuzhiyunSYM_FUNC_END(aesbs_cbc_decrypt)
722*4882a593Smuzhiyun
723*4882a593Smuzhiyun	.macro		next_tweak, out, in, const, tmp
	/*
	 * Derive the next tweak from the current one. Each 64-bit lane of in
	 * is doubled, and the bit shifted out of a lane is folded into the
	 * other lane through the matching lane of const:
	 *
	 *   out.d[0] = (in.d[0] << 1) ^ (msb(in.d[1]) ? const.d[1] : 0)
	 *   out.d[1] = (in.d[1] << 1) ^ (msb(in.d[0]) ? const.d[0] : 0)
	 *
	 * With const.d[0] = 1 and const.d[1] = 0x87, as built by __xts_crypt,
	 * this is a 128-bit left shift by one with 0x87 folded into the low
	 * lane whenever the top bit is shifted out.
	 *
	 * tmp is clobbered. out may be the same register as in, because in is
	 * read for the last time by the add that first writes out.
	 */
724*4882a593Smuzhiyun	sshr		\tmp\().2d,  \in\().2d,   #63		// per lane: all ones if the top bit is set
725*4882a593Smuzhiyun	and		\tmp\().16b, \tmp\().16b, \const\().16b	// keep const only where a bit is shifted out
726*4882a593Smuzhiyun	add		\out\().2d,  \in\().2d,   \in\().2d		// double each 64-bit lane
727*4882a593Smuzhiyun	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8	// swap the two 64-bit halves of tmp
728*4882a593Smuzhiyun	eor		\out\().16b, \out\().16b, \tmp\().16b	// fold each carry into the opposite lane
729*4882a593Smuzhiyun	.endm
730*4882a593Smuzhiyun
731*4882a593Smuzhiyun	/*
732*4882a593Smuzhiyun	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
733*4882a593Smuzhiyun	 *		     int blocks, u8 iv[])
734*4882a593Smuzhiyun	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
735*4882a593Smuzhiyun	 *		     int blocks, u8 iv[])
736*4882a593Smuzhiyun	 */
737*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(__xts_crypt8)
	/*
	 * Shared front end of the XTS routines; called from the __xts_crypt
	 * macro with:
	 *   x16     - address of the 8-block primitive to run
	 *   x20     - input pointer (advanced by 16 per block loaded)
	 *   x21/x22 - round keys / round count, copied to bskey / rounds
	 *   x23     - number of blocks remaining
	 *   v25     - tweak for the first block of the batch
	 *   v30     - constant operand for next_tweak (v31 is its scratch)
	 *
	 * Loads up to eight blocks into v0-v7 and XORs each with its tweak.
	 * Tweaks 0..3 stay in v25-v28. Tweaks 4..7 are written to the 64 bytes
	 * of frame storage at .Lframe_local_offset, with v29 serving as the
	 * rolling tweak.
	 *
	 * On exit:
	 *   x6  - batch mask: 0 for eight blocks, otherwise 1 << n for a
	 *         batch of n blocks
	 *   x23 - number of blocks left after this batch
	 *   the tweak following the last one consumed is in v26, v27 or v28
	 *   for batches of 1, 2 or 3 blocks, and in v29 for 4 or more.
	 *
	 * The routine ends with br x16 and leaves the link register as set
	 * by the caller's bl, so the primitive presumably returns straight to
	 * the code following that bl.
	 */
738*4882a593Smuzhiyun	mov		x6, #1
739*4882a593Smuzhiyun	lsl		x6, x6, x23
740*4882a593Smuzhiyun	subs		w23, w23, #8
741*4882a593Smuzhiyun	csel		x23, x23, xzr, pl	// blocks left after this batch
742*4882a593Smuzhiyun	csel		x6, x6, xzr, mi		// mask bit survives only for a short batch
743*4882a593Smuzhiyun
744*4882a593Smuzhiyun	ld1		{v0.16b}, [x20], #16
745*4882a593Smuzhiyun	next_tweak	v26, v25, v30, v31	// v26 = tweak 1
746*4882a593Smuzhiyun	eor		v0.16b, v0.16b, v25.16b
747*4882a593Smuzhiyun	tbnz		x6, #1, 0f
748*4882a593Smuzhiyun
749*4882a593Smuzhiyun	ld1		{v1.16b}, [x20], #16
750*4882a593Smuzhiyun	next_tweak	v27, v26, v30, v31	// v27 = tweak 2
751*4882a593Smuzhiyun	eor		v1.16b, v1.16b, v26.16b
752*4882a593Smuzhiyun	tbnz		x6, #2, 0f
753*4882a593Smuzhiyun
754*4882a593Smuzhiyun	ld1		{v2.16b}, [x20], #16
755*4882a593Smuzhiyun	next_tweak	v28, v27, v30, v31	// v28 = tweak 3
756*4882a593Smuzhiyun	eor		v2.16b, v2.16b, v27.16b
757*4882a593Smuzhiyun	tbnz		x6, #3, 0f
758*4882a593Smuzhiyun
759*4882a593Smuzhiyun	ld1		{v3.16b}, [x20], #16
760*4882a593Smuzhiyun	next_tweak	v29, v28, v30, v31	// v29 = tweak 4
761*4882a593Smuzhiyun	eor		v3.16b, v3.16b, v28.16b
762*4882a593Smuzhiyun	tbnz		x6, #4, 0f
763*4882a593Smuzhiyun
	/*
	 * From here on v29 is reused for every tweak: the current value is
	 * saved to the frame first, then replaced by its successor.
	 */
764*4882a593Smuzhiyun	ld1		{v4.16b}, [x20], #16
765*4882a593Smuzhiyun	str		q29, [sp, #.Lframe_local_offset]	// save tweak 4
766*4882a593Smuzhiyun	eor		v4.16b, v4.16b, v29.16b
767*4882a593Smuzhiyun	next_tweak	v29, v29, v30, v31	// v29 = tweak 5
768*4882a593Smuzhiyun	tbnz		x6, #5, 0f
769*4882a593Smuzhiyun
770*4882a593Smuzhiyun	ld1		{v5.16b}, [x20], #16
771*4882a593Smuzhiyun	str		q29, [sp, #.Lframe_local_offset + 16]	// save tweak 5
772*4882a593Smuzhiyun	eor		v5.16b, v5.16b, v29.16b
773*4882a593Smuzhiyun	next_tweak	v29, v29, v30, v31	// v29 = tweak 6
774*4882a593Smuzhiyun	tbnz		x6, #6, 0f
775*4882a593Smuzhiyun
776*4882a593Smuzhiyun	ld1		{v6.16b}, [x20], #16
777*4882a593Smuzhiyun	str		q29, [sp, #.Lframe_local_offset + 32]	// save tweak 6
778*4882a593Smuzhiyun	eor		v6.16b, v6.16b, v29.16b
779*4882a593Smuzhiyun	next_tweak	v29, v29, v30, v31	// v29 = tweak 7
780*4882a593Smuzhiyun	tbnz		x6, #7, 0f
781*4882a593Smuzhiyun
782*4882a593Smuzhiyun	ld1		{v7.16b}, [x20], #16
783*4882a593Smuzhiyun	str		q29, [sp, #.Lframe_local_offset + 48]	// save tweak 7
784*4882a593Smuzhiyun	eor		v7.16b, v7.16b, v29.16b
785*4882a593Smuzhiyun	next_tweak	v29, v29, v30, v31	// v29 = first tweak of the next batch
786*4882a593Smuzhiyun
787*4882a593Smuzhiyun0:	mov		bskey, x21
788*4882a593Smuzhiyun	mov		rounds, x22
789*4882a593Smuzhiyun	br		x16			// tail branch into the primitive; lr is unchanged
790*4882a593SmuzhiyunSYM_FUNC_END(__xts_crypt8)
791*4882a593Smuzhiyun
792*4882a593Smuzhiyun	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	/*
	 * Function body shared by aesbs_xts_encrypt and aesbs_xts_decrypt.
	 *   do8   - 8-block primitive, reached through __xts_crypt8
	 *   o0-o7 - registers treated as output blocks 0..7 of do8
	 *
	 * The arguments (x0-x5 in prototype order) are kept in x19-x24:
	 * out, in, rk, rounds, blocks, iv. The 64 bytes of frame storage
	 * requested from frame_push hold tweaks 4..7 of a batch.
	 * Batches are sized as in __xts_crypt8: x6 is 0 for eight blocks,
	 * otherwise 1 << n for a last batch of n blocks.
	 * NOTE(review): frame_push/frame_pop and .Lframe_local_offset are not
	 * defined in the visible part of this file - confirm their contract
	 * in <asm/assembler.h>.
	 */
793*4882a593Smuzhiyun	frame_push	6, 64
794*4882a593Smuzhiyun
795*4882a593Smuzhiyun	mov		x19, x0			// out
796*4882a593Smuzhiyun	mov		x20, x1			// in
797*4882a593Smuzhiyun	mov		x21, x2			// rk
798*4882a593Smuzhiyun	mov		x22, x3			// rounds
799*4882a593Smuzhiyun	mov		x23, x4			// blocks
800*4882a593Smuzhiyun	mov		x24, x5			// iv
801*4882a593Smuzhiyun
	/* Build the next_tweak constant: v30.d[0] = 1, v30.d[1] = 0x87. */
802*4882a593Smuzhiyun	movi		v30.2s, #0x1		// 32-bit lanes { 1, 1, 0, 0 }
803*4882a593Smuzhiyun	movi		v25.2s, #0x87		// 32-bit lanes { 0x87, 0x87, 0, 0 }
804*4882a593Smuzhiyun	uzp1		v30.4s, v30.4s, v25.4s	// even lanes of both: { 1, 0, 0x87, 0 }
805*4882a593Smuzhiyun	ld1		{v25.16b}, [x24]	// first tweak comes from iv
806*4882a593Smuzhiyun
807*4882a593Smuzhiyun99:	adr		x16, \do8
808*4882a593Smuzhiyun	bl		__xts_crypt8
809*4882a593Smuzhiyun
	/*
	 * Fetch tweaks 4..7 saved by __xts_crypt8. Slots that were not
	 * written for a short batch hold stale data, but the results they
	 * are XORed into are never stored in that case.
	 */
810*4882a593Smuzhiyun	ldp		q16, q17, [sp, #.Lframe_local_offset]
811*4882a593Smuzhiyun	ldp		q18, q19, [sp, #.Lframe_local_offset + 32]
812*4882a593Smuzhiyun
813*4882a593Smuzhiyun	eor		\o0\().16b, \o0\().16b, v25.16b
814*4882a593Smuzhiyun	eor		\o1\().16b, \o1\().16b, v26.16b
815*4882a593Smuzhiyun	eor		\o2\().16b, \o2\().16b, v27.16b
816*4882a593Smuzhiyun	eor		\o3\().16b, \o3\().16b, v28.16b
817*4882a593Smuzhiyun
	/*
	 * Store the results. Alongside, v25 is kept equal to the tweak that
	 * follows the last block written; it is what gets stored to iv.
	 */
818*4882a593Smuzhiyun	st1		{\o0\().16b}, [x19], #16
819*4882a593Smuzhiyun	mov		v25.16b, v26.16b
820*4882a593Smuzhiyun	tbnz		x6, #1, 1f
821*4882a593Smuzhiyun	st1		{\o1\().16b}, [x19], #16
822*4882a593Smuzhiyun	mov		v25.16b, v27.16b
823*4882a593Smuzhiyun	tbnz		x6, #2, 1f
824*4882a593Smuzhiyun	st1		{\o2\().16b}, [x19], #16
825*4882a593Smuzhiyun	mov		v25.16b, v28.16b
826*4882a593Smuzhiyun	tbnz		x6, #3, 1f
827*4882a593Smuzhiyun	st1		{\o3\().16b}, [x19], #16
828*4882a593Smuzhiyun	mov		v25.16b, v29.16b	// for 4 or more blocks v29 already is the next tweak
829*4882a593Smuzhiyun	tbnz		x6, #4, 1f
830*4882a593Smuzhiyun
831*4882a593Smuzhiyun	eor		\o4\().16b, \o4\().16b, v16.16b
832*4882a593Smuzhiyun	eor		\o5\().16b, \o5\().16b, v17.16b
833*4882a593Smuzhiyun	eor		\o6\().16b, \o6\().16b, v18.16b
834*4882a593Smuzhiyun	eor		\o7\().16b, \o7\().16b, v19.16b
835*4882a593Smuzhiyun
836*4882a593Smuzhiyun	st1		{\o4\().16b}, [x19], #16
837*4882a593Smuzhiyun	tbnz		x6, #5, 1f
838*4882a593Smuzhiyun	st1		{\o5\().16b}, [x19], #16
839*4882a593Smuzhiyun	tbnz		x6, #6, 1f
840*4882a593Smuzhiyun	st1		{\o6\().16b}, [x19], #16
841*4882a593Smuzhiyun	tbnz		x6, #7, 1f
842*4882a593Smuzhiyun	st1		{\o7\().16b}, [x19], #16
843*4882a593Smuzhiyun
844*4882a593Smuzhiyun	cbz		x23, 1f			// no further batch
845*4882a593Smuzhiyun	st1		{v25.16b}, [x24]	// update iv before starting the next batch
846*4882a593Smuzhiyun
847*4882a593Smuzhiyun	b		99b
848*4882a593Smuzhiyun
849*4882a593Smuzhiyun1:	st1		{v25.16b}, [x24]	// return the next tweak through iv
850*4882a593Smuzhiyun	frame_pop
851*4882a593Smuzhiyun	ret
852*4882a593Smuzhiyun	.endm
853*4882a593Smuzhiyun
854*4882a593SmuzhiyunSYM_FUNC_START(aesbs_xts_encrypt)
	/*
	 * XTS encryption: the __xts_crypt body with aesbs_encrypt8 as the
	 * primitive. v0, v1, v4, v6, v3, v7, v2, v5 are used as output blocks
	 * 0..7, the same order aesbs_ctr_encrypt applies to the results of
	 * aesbs_encrypt8.
	 */
855*4882a593Smuzhiyun	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
856*4882a593SmuzhiyunSYM_FUNC_END(aesbs_xts_encrypt)
857*4882a593Smuzhiyun
858*4882a593SmuzhiyunSYM_FUNC_START(aesbs_xts_decrypt)
	/*
	 * XTS decryption: the __xts_crypt body with aesbs_decrypt8 as the
	 * primitive. v0, v1, v6, v4, v2, v7, v3, v5 are used as output blocks
	 * 0..7, the same order aesbs_cbc_decrypt applies to the results of
	 * aesbs_decrypt8.
	 */
859*4882a593Smuzhiyun	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
860*4882a593SmuzhiyunSYM_FUNC_END(aesbs_xts_decrypt)
861*4882a593Smuzhiyun
862*4882a593Smuzhiyun	.macro		next_ctr, v
	/*
	 * Write the 128-bit counter held in x7 (upper half) and x8 (lower
	 * half) to vector register v, then increment x7:x8 by one.
	 * rev64 reverses the bytes of each 64-bit lane, so byte 0 of the
	 * register ends up as the most significant byte of x7 and byte 15 as
	 * the least significant byte of x8.
	 * Clobbers the condition flags.
	 */
863*4882a593Smuzhiyun	mov		\v\().d[1], x8
864*4882a593Smuzhiyun	adds		x8, x8, #1		// low half + 1, carry out in C
865*4882a593Smuzhiyun	mov		\v\().d[0], x7		// lane insert, leaves the carry flag alone
866*4882a593Smuzhiyun	adc		x7, x7, xzr		// propagate the carry into the high half
867*4882a593Smuzhiyun	rev64		\v\().16b, \v\().16b
868*4882a593Smuzhiyun	.endm
869*4882a593Smuzhiyun
870*4882a593Smuzhiyun	/*
871*4882a593Smuzhiyun	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
872*4882a593Smuzhiyun	 *		     int rounds, int blocks, u8 iv[], u8 final[])
873*4882a593Smuzhiyun	 */
874*4882a593SmuzhiyunSYM_FUNC_START(aesbs_ctr_encrypt)
	/*
	 * CTR mode, processed in batches of up to eight blocks.
	 *
	 * Register use (arguments arrive in x0-x6 in prototype order):
	 *   x19    - out, x20 - in (both advanced by 16 per data block)
	 *   x21    - rk, x22 - rounds (handed over through bskey / rounds)
	 *   x23    - keystream blocks still to generate; includes one extra
	 *            block when final is non-NULL
	 *   x24    - iv, the counter block; rewritten after every batch
	 *   x25    - final (may be NULL)
	 *   x26    - 1 if final is non-NULL, else 0
	 *   x7:x8  - counter value for the next block to set up (x7 is the
	 *            upper half), consumed by next_ctr
	 *   x9     - batch mask, see label 99
	 *   v0     - counter block for the first block of the batch
	 *   v8-v15 - hold the input blocks while they are XORed
	 *
	 * The code relies on aesbs_encrypt8 (defined outside this chunk)
	 * leaving x7, x8 and x9 untouched.
	 * NOTE(review): v8-v15 are overwritten without being saved although
	 * AAPCS64 treats their low halves as callee-saved; presumably fine
	 * because callers run this inside a kernel NEON section - confirm.
	 * NOTE(review): frame_push 8 presumably preserves x19-x26 - confirm
	 * in <asm/assembler.h>.
	 */
875*4882a593Smuzhiyun	frame_push	8
876*4882a593Smuzhiyun
877*4882a593Smuzhiyun	mov		x19, x0			// out
878*4882a593Smuzhiyun	mov		x20, x1			// in
879*4882a593Smuzhiyun	mov		x21, x2			// rk
880*4882a593Smuzhiyun	mov		x22, x3			// rounds
881*4882a593Smuzhiyun	mov		x23, x4			// blocks
882*4882a593Smuzhiyun	mov		x24, x5			// iv
883*4882a593Smuzhiyun	mov		x25, x6			// final
884*4882a593Smuzhiyun
885*4882a593Smuzhiyun	cmp		x25, #0
886*4882a593Smuzhiyun	cset		x26, ne
887*4882a593Smuzhiyun	add		x23, x23, x26		// do one extra block if final
888*4882a593Smuzhiyun
	/*
	 * Read the counter twice: as two 64-bit integers for arithmetic and
	 * as the ready-to-use first counter block in v0.
	 * NOTE(review): CPU_LE() is not defined in the visible part of this
	 * file; presumably it emits its argument on little-endian builds
	 * only, turning the loaded halves into native integers.
	 */
889*4882a593Smuzhiyun	ldp		x7, x8, [x24]
890*4882a593Smuzhiyun	ld1		{v0.16b}, [x24]
891*4882a593SmuzhiyunCPU_LE(	rev		x7, x7		)
892*4882a593SmuzhiyunCPU_LE(	rev		x8, x8		)
893*4882a593Smuzhiyun	adds		x8, x8, #1		// x7:x8 = counter of the second block
894*4882a593Smuzhiyun	adc		x7, x7, xzr
895*4882a593Smuzhiyun
	/*
	 * Work out the size of this batch: x9 = 1 << x23. If more than eight
	 * blocks remain, x9 is cleared and x23 becomes the count left after
	 * this batch. Otherwise x23 becomes 0 and x9 keeps a single bit whose
	 * index is the number of blocks in this last batch (bit 8 for
	 * exactly eight).
	 */
896*4882a593Smuzhiyun99:	mov		x9, #1
897*4882a593Smuzhiyun	lsl		x9, x9, x23
898*4882a593Smuzhiyun	subs		w23, w23, #8
899*4882a593Smuzhiyun	csel		x23, x23, xzr, pl
900*4882a593Smuzhiyun	csel		x9, x9, xzr, le
901*4882a593Smuzhiyun
	/*
	 * Set up counter blocks v1-v7 (v0 is already in place). Each
	 * tbnz x9, #n stops once n counter blocks exist.
	 */
902*4882a593Smuzhiyun	tbnz		x9, #1, 0f
903*4882a593Smuzhiyun	next_ctr	v1
904*4882a593Smuzhiyun	tbnz		x9, #2, 0f
905*4882a593Smuzhiyun	next_ctr	v2
906*4882a593Smuzhiyun	tbnz		x9, #3, 0f
907*4882a593Smuzhiyun	next_ctr	v3
908*4882a593Smuzhiyun	tbnz		x9, #4, 0f
909*4882a593Smuzhiyun	next_ctr	v4
910*4882a593Smuzhiyun	tbnz		x9, #5, 0f
911*4882a593Smuzhiyun	next_ctr	v5
912*4882a593Smuzhiyun	tbnz		x9, #6, 0f
913*4882a593Smuzhiyun	next_ctr	v6
914*4882a593Smuzhiyun	tbnz		x9, #7, 0f
915*4882a593Smuzhiyun	next_ctr	v7
916*4882a593Smuzhiyun
917*4882a593Smuzhiyun0:	mov		bskey, x21
918*4882a593Smuzhiyun	mov		rounds, x22
919*4882a593Smuzhiyun	bl		aesbs_encrypt8
920*4882a593Smuzhiyun
	/*
	 * After the shift, a set bit n in x9 means: n data blocks, followed
	 * by the keystream block destined for final (if any). The 0f below
	 * refers to the second 0: label, in the tail stubs at the end.
	 */
921*4882a593Smuzhiyun	lsr		x9, x9, x26		// disregard the extra block
922*4882a593Smuzhiyun	tbnz		x9, #0, 0f
923*4882a593Smuzhiyun
	/*
	 * XOR the input with the keystream and store it. Keystream blocks
	 * 0..7 are taken from v0, v1, v4, v6, v3, v7, v2, v5. A set bit n
	 * diverts to tail stub n once n data blocks have been written.
	 */
924*4882a593Smuzhiyun	ld1		{v8.16b}, [x20], #16
925*4882a593Smuzhiyun	eor		v0.16b, v0.16b, v8.16b
926*4882a593Smuzhiyun	st1		{v0.16b}, [x19], #16
927*4882a593Smuzhiyun	tbnz		x9, #1, 1f
928*4882a593Smuzhiyun
929*4882a593Smuzhiyun	ld1		{v9.16b}, [x20], #16
930*4882a593Smuzhiyun	eor		v1.16b, v1.16b, v9.16b
931*4882a593Smuzhiyun	st1		{v1.16b}, [x19], #16
932*4882a593Smuzhiyun	tbnz		x9, #2, 2f
933*4882a593Smuzhiyun
934*4882a593Smuzhiyun	ld1		{v10.16b}, [x20], #16
935*4882a593Smuzhiyun	eor		v4.16b, v4.16b, v10.16b
936*4882a593Smuzhiyun	st1		{v4.16b}, [x19], #16
937*4882a593Smuzhiyun	tbnz		x9, #3, 3f
938*4882a593Smuzhiyun
939*4882a593Smuzhiyun	ld1		{v11.16b}, [x20], #16
940*4882a593Smuzhiyun	eor		v6.16b, v6.16b, v11.16b
941*4882a593Smuzhiyun	st1		{v6.16b}, [x19], #16
942*4882a593Smuzhiyun	tbnz		x9, #4, 4f
943*4882a593Smuzhiyun
944*4882a593Smuzhiyun	ld1		{v12.16b}, [x20], #16
945*4882a593Smuzhiyun	eor		v3.16b, v3.16b, v12.16b
946*4882a593Smuzhiyun	st1		{v3.16b}, [x19], #16
947*4882a593Smuzhiyun	tbnz		x9, #5, 5f
948*4882a593Smuzhiyun
949*4882a593Smuzhiyun	ld1		{v13.16b}, [x20], #16
950*4882a593Smuzhiyun	eor		v7.16b, v7.16b, v13.16b
951*4882a593Smuzhiyun	st1		{v7.16b}, [x19], #16
952*4882a593Smuzhiyun	tbnz		x9, #6, 6f
953*4882a593Smuzhiyun
954*4882a593Smuzhiyun	ld1		{v14.16b}, [x20], #16
955*4882a593Smuzhiyun	eor		v2.16b, v2.16b, v14.16b
956*4882a593Smuzhiyun	st1		{v2.16b}, [x19], #16
957*4882a593Smuzhiyun	tbnz		x9, #7, 7f
958*4882a593Smuzhiyun
959*4882a593Smuzhiyun	ld1		{v15.16b}, [x20], #16
960*4882a593Smuzhiyun	eor		v5.16b, v5.16b, v15.16b
961*4882a593Smuzhiyun	st1		{v5.16b}, [x19], #16
962*4882a593Smuzhiyun
963*4882a593Smuzhiyun8:	next_ctr	v0			// v0 = counter block after the last one set up
964*4882a593Smuzhiyun	st1		{v0.16b}, [x24]		// write it back to iv
965*4882a593Smuzhiyun	cbz		x23, .Lctr_done
966*4882a593Smuzhiyun
967*4882a593Smuzhiyun	b		99b
968*4882a593Smuzhiyun
969*4882a593Smuzhiyun.Lctr_done:
970*4882a593Smuzhiyun	frame_pop
971*4882a593Smuzhiyun	ret
972*4882a593Smuzhiyun
973*4882a593Smuzhiyun	/*
974*4882a593Smuzhiyun	 * If we are handling the tail of the input (x6 != NULL), return the
975*4882a593Smuzhiyun	 * final keystream block back to the caller.
	 *
	 * x6 refers to the final[] argument as passed on entry; the code
	 * tests the copy kept in x25. Stub n is reached after n data blocks
	 * of a short batch and stores keystream block n, unless final is
	 * NULL, in which case it goes straight back to label 8.
976*4882a593Smuzhiyun	 */
977*4882a593Smuzhiyun0:	cbz		x25, 8b
978*4882a593Smuzhiyun	st1		{v0.16b}, [x25]
979*4882a593Smuzhiyun	b		8b
980*4882a593Smuzhiyun1:	cbz		x25, 8b
981*4882a593Smuzhiyun	st1		{v1.16b}, [x25]
982*4882a593Smuzhiyun	b		8b
983*4882a593Smuzhiyun2:	cbz		x25, 8b
984*4882a593Smuzhiyun	st1		{v4.16b}, [x25]
985*4882a593Smuzhiyun	b		8b
986*4882a593Smuzhiyun3:	cbz		x25, 8b
987*4882a593Smuzhiyun	st1		{v6.16b}, [x25]
988*4882a593Smuzhiyun	b		8b
989*4882a593Smuzhiyun4:	cbz		x25, 8b
990*4882a593Smuzhiyun	st1		{v3.16b}, [x25]
991*4882a593Smuzhiyun	b		8b
992*4882a593Smuzhiyun5:	cbz		x25, 8b
993*4882a593Smuzhiyun	st1		{v7.16b}, [x25]
994*4882a593Smuzhiyun	b		8b
995*4882a593Smuzhiyun6:	cbz		x25, 8b
996*4882a593Smuzhiyun	st1		{v2.16b}, [x25]
997*4882a593Smuzhiyun	b		8b
998*4882a593Smuzhiyun7:	cbz		x25, 8b
999*4882a593Smuzhiyun	st1		{v5.16b}, [x25]
1000*4882a593Smuzhiyun	b		8b
1001*4882a593SmuzhiyunSYM_FUNC_END(aesbs_ctr_encrypt)
1002