xref: /OK3568_Linux_fs/kernel/arch/arm64/crypto/aes-neon.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-only */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
6*4882a593Smuzhiyun */
7*4882a593Smuzhiyun
8*4882a593Smuzhiyun#include <linux/linkage.h>
9*4882a593Smuzhiyun#include <asm/assembler.h>
10*4882a593Smuzhiyun
	/*
	 * Emit the generic mode entry points from aes-modes.S under a
	 * "neon_" symbol prefix so this NEON implementation can coexist
	 * with the other AES drivers that include the same file.
	 */
11*4882a593Smuzhiyun#define AES_FUNC_START(func)		SYM_FUNC_START(neon_ ## func)
12*4882a593Smuzhiyun#define AES_FUNC_END(func)		SYM_FUNC_END(neon_ ## func)
13*4882a593Smuzhiyun
	/*
	 * Register aliases expected by aes-modes.S.  The XTS tweak mask and
	 * the CBC IV both alias v7 (they map to the same register, so they
	 * are presumably never live at the same time — a single call uses
	 * only one mode); the CTR counter block lives in v4.
	 */
14*4882a593Smuzhiyun	xtsmask		.req	v7
15*4882a593Smuzhiyun	cbciv		.req	v7
16*4882a593Smuzhiyun	vctr		.req	v4
17*4882a593Smuzhiyun
	/*
	 * Refresh the XTS tweak mask in xtsmask (v7).  This implementation
	 * keeps no cached copy, so it simply re-runs xts_load_mask (defined
	 * outside this file — presumably by the included aes-modes.S code).
	 */
18*4882a593Smuzhiyun	.macro		xts_reload_mask, tmp
19*4882a593Smuzhiyun	xts_load_mask	\tmp
20*4882a593Smuzhiyun	.endm
21*4882a593Smuzhiyun
22*4882a593Smuzhiyun	/* special case for the neon-bs driver calling into this one for CTS */
	/* Branch to \lbl (skipping tweak generation) when bit 1 of \reg is set. */
23*4882a593Smuzhiyun	.macro		xts_cts_skip_tw, reg, lbl
24*4882a593Smuzhiyun	tbnz		\reg, #1, \lbl
25*4882a593Smuzhiyun	.endm
26*4882a593Smuzhiyun
27*4882a593Smuzhiyun	/* multiply by polynomial 'x' in GF(2^8) */
	/*
	 * \out = \in * x.  \const must hold the reduction constant 0x1b in
	 * every byte (prepare loads it into v12); operands carry their own
	 * arrangement specifiers (callers pass .16b registers).
	 */
28*4882a593Smuzhiyun	.macro		mul_by_x, out, in, temp, const
29*4882a593Smuzhiyun	sshr		\temp, \in, #7		/* 0xff in bytes whose top bit is set */
30*4882a593Smuzhiyun	shl		\out, \in, #1		/* multiply by x, dropping the top bit */
31*4882a593Smuzhiyun	and		\temp, \temp, \const	/* 0x1b where a bit was shifted out */
32*4882a593Smuzhiyun	eor		\out, \out, \temp	/* reduce mod x^8+x^4+x^3+x+1 */
33*4882a593Smuzhiyun	.endm
34*4882a593Smuzhiyun
35*4882a593Smuzhiyun	/* multiply by polynomial 'x^2' in GF(2^8) */
	/*
	 * \out = \in * x^2.  \const must hold 0x1b in every byte.  The two
	 * bits shifted out of each byte are folded back in with a single
	 * carryless multiply by the reduction constant.
	 */
36*4882a593Smuzhiyun	.macro		mul_by_x2, out, in, temp, const
37*4882a593Smuzhiyun	ushr		\temp, \in, #6		/* isolate the two top bits of each byte */
38*4882a593Smuzhiyun	shl		\out, \in, #2		/* multiply by x^2 */
39*4882a593Smuzhiyun	pmul		\temp, \temp, \const	/* polynomial-mul folds both bits back in */
40*4882a593Smuzhiyun	eor		\out, \out, \temp	/* reduce mod x^8+x^4+x^3+x+1 */
41*4882a593Smuzhiyun	.endm
42*4882a593Smuzhiyun
43*4882a593Smuzhiyun	/* preload the entire Sbox */
	/*
	 * Load all lookup tables into fixed registers:
	 *   v12     - GF(2^8) reduction constant 0x1b (used by mul_by_x/x2)
	 *   v13     - ShiftRows byte permutation (tbl index vector)
	 *   v14     - ror-each-32-bit-word-by-8 permutation (.Lror32by8)
	 *   v16-v31 - the full 256-byte S-box, 64 bytes per ld1 group
	 * \temp is a general-purpose scratch register.
	 */
44*4882a593Smuzhiyun	.macro		prepare, sbox, shiftrows, temp
45*4882a593Smuzhiyun	movi		v12.16b, #0x1b
46*4882a593Smuzhiyun	ldr_l		q13, \shiftrows, \temp
47*4882a593Smuzhiyun	ldr_l		q14, .Lror32by8, \temp
48*4882a593Smuzhiyun	adr_l		\temp, \sbox
49*4882a593Smuzhiyun	ld1		{v16.16b-v19.16b}, [\temp], #64
50*4882a593Smuzhiyun	ld1		{v20.16b-v23.16b}, [\temp], #64
51*4882a593Smuzhiyun	ld1		{v24.16b-v27.16b}, [\temp], #64
52*4882a593Smuzhiyun	ld1		{v28.16b-v31.16b}, [\temp]
53*4882a593Smuzhiyun	.endm
54*4882a593Smuzhiyun
55*4882a593Smuzhiyun	/* do preload for encryption */
	/* key/rounds arguments are ignored: the preloaded tables are key independent */
56*4882a593Smuzhiyun	.macro		enc_prepare, ignore0, ignore1, temp
57*4882a593Smuzhiyun	prepare		crypto_aes_sbox, .LForward_ShiftRows, \temp
58*4882a593Smuzhiyun	.endm
59*4882a593Smuzhiyun
	/* no key-derived state is preloaded, so switching keys requires no work */
60*4882a593Smuzhiyun	.macro		enc_switch_key, ignore0, ignore1, temp
61*4882a593Smuzhiyun	/* do nothing */
62*4882a593Smuzhiyun	.endm
63*4882a593Smuzhiyun
64*4882a593Smuzhiyun	/* do preload for decryption */
	/* same as enc_prepare, but with the inverse S-box and reverse ShiftRows */
65*4882a593Smuzhiyun	.macro		dec_prepare, ignore0, ignore1, temp
66*4882a593Smuzhiyun	prepare		crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
67*4882a593Smuzhiyun	.endm
68*4882a593Smuzhiyun
69*4882a593Smuzhiyun	/* apply SubBytes transformation using the preloaded Sbox */
	/*
	 * tbl/tbx index at most 64 table bytes (four registers) at a time,
	 * so the 256-byte S-box in v16-v31 takes four lookups, rebasing the
	 * index downward by 0x40 before each one.  tbx leaves lanes whose
	 * index is out of range untouched, so each step fills in only its
	 * own quarter.  Expects v15 to hold 0x40 in every byte (do_block
	 * sets it up); clobbers v9-v11.
	 */
70*4882a593Smuzhiyun	.macro		sub_bytes, in
71*4882a593Smuzhiyun	sub		v9.16b, \in\().16b, v15.16b
72*4882a593Smuzhiyun	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
73*4882a593Smuzhiyun	sub		v10.16b, v9.16b, v15.16b
74*4882a593Smuzhiyun	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
75*4882a593Smuzhiyun	sub		v11.16b, v10.16b, v15.16b
76*4882a593Smuzhiyun	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
77*4882a593Smuzhiyun	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
78*4882a593Smuzhiyun	.endm
79*4882a593Smuzhiyun
80*4882a593Smuzhiyun	/* apply MixColumns transformation */
	/*
	 * Transform \in in place; \enc == 0 selects Inverse MixColumns.
	 * Requires v12 = 0x1b per byte and v14 = .Lror32by8 permutation
	 * (both set up by prepare).  Clobbers v8 and v9.
	 */
81*4882a593Smuzhiyun	.macro		mix_columns, in, enc
82*4882a593Smuzhiyun	.if		\enc == 0
83*4882a593Smuzhiyun	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
84*4882a593Smuzhiyun	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b	/* v8 = 4.in */
85*4882a593Smuzhiyun	eor		\in\().16b, \in\().16b, v8.16b		/* in ^= 4.in */
86*4882a593Smuzhiyun	rev32		v8.8h, v8.8h				/* rotate each word of 4.in by 16 bits */
87*4882a593Smuzhiyun	eor		\in\().16b, \in\().16b, v8.16b
88*4882a593Smuzhiyun	.endif
89*4882a593Smuzhiyun
90*4882a593Smuzhiyun	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b	/* v9 = 2.in */
91*4882a593Smuzhiyun	rev32		v8.8h, \in\().8h			/* in with each 32-bit word rotated by 16 */
92*4882a593Smuzhiyun	eor		v8.16b, v8.16b, v9.16b
93*4882a593Smuzhiyun	eor		\in\().16b, \in\().16b, v8.16b
94*4882a593Smuzhiyun	tbl		\in\().16b, {\in\().16b}, v14.16b	/* rotate each 32-bit word by 8 bits */
95*4882a593Smuzhiyun	eor		\in\().16b, \in\().16b, v8.16b
96*4882a593Smuzhiyun	.endm
97*4882a593Smuzhiyun
	/*
	 * Run all AES rounds on one block in place.
	 *   \enc    - 1 for encryption, 0 for decryption
	 *   \in     - state register (transformed in place)
	 *   \rounds - round count
	 *   \rk     - pointer to the round keys (only read)
	 *   \rkp/\i - scratch pointer and counter (clobbered)
	 * v15 does double duty: it holds the current round key at each eor,
	 * and the 0x40 rebasing constant needed by sub_bytes in between.
	 */
98*4882a593Smuzhiyun	.macro		do_block, enc, in, rounds, rk, rkp, i
99*4882a593Smuzhiyun	ld1		{v15.4s}, [\rk]			/* first round key */
100*4882a593Smuzhiyun	add		\rkp, \rk, #16
101*4882a593Smuzhiyun	mov		\i, \rounds
102*4882a593Smuzhiyun1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
103*4882a593Smuzhiyun	movi		v15.16b, #0x40			/* rebasing constant for sub_bytes */
104*4882a593Smuzhiyun	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
105*4882a593Smuzhiyun	sub_bytes	\in
106*4882a593Smuzhiyun	subs		\i, \i, #1
107*4882a593Smuzhiyun	ld1		{v15.4s}, [\rkp], #16		/* next round key */
108*4882a593Smuzhiyun	beq		2222f				/* final round: no MixColumns */
109*4882a593Smuzhiyun	mix_columns	\in, \enc
110*4882a593Smuzhiyun	b		1111b
111*4882a593Smuzhiyun2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
112*4882a593Smuzhiyun	.endm
113*4882a593Smuzhiyun
	/* encrypt one block in place (see do_block for the argument list) */
114*4882a593Smuzhiyun	.macro		encrypt_block, in, rounds, rk, rkp, i
115*4882a593Smuzhiyun	do_block	1, \in, \rounds, \rk, \rkp, \i
116*4882a593Smuzhiyun	.endm
117*4882a593Smuzhiyun
	/* decrypt one block in place (see do_block for the argument list) */
118*4882a593Smuzhiyun	.macro		decrypt_block, in, rounds, rk, rkp, i
119*4882a593Smuzhiyun	do_block	0, \in, \rounds, \rk, \rkp, \i
120*4882a593Smuzhiyun	.endm
121*4882a593Smuzhiyun
122*4882a593Smuzhiyun	/*
123*4882a593Smuzhiyun	 * Interleaved versions: functionally equivalent to the
124*4882a593Smuzhiyun	 * ones above, but applied to AES states in parallel.
125*4882a593Smuzhiyun	 */
126*4882a593Smuzhiyun
	/*
	 * Four-way SubBytes: the same four-stage tbl/tbx lookup scheme as
	 * sub_bytes, applied to four states with the lookups and the 0x40
	 * index-rebasing subtractions of the different states interleaved.
	 * Expects v15 to hold 0x40 in every byte; clobbers v8-v11.
	 */
127*4882a593Smuzhiyun	.macro		sub_bytes_4x, in0, in1, in2, in3
128*4882a593Smuzhiyun	sub		v8.16b, \in0\().16b, v15.16b
129*4882a593Smuzhiyun	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
130*4882a593Smuzhiyun	sub		v9.16b, \in1\().16b, v15.16b
131*4882a593Smuzhiyun	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
132*4882a593Smuzhiyun	sub		v10.16b, \in2\().16b, v15.16b
133*4882a593Smuzhiyun	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
134*4882a593Smuzhiyun	sub		v11.16b, \in3\().16b, v15.16b
135*4882a593Smuzhiyun	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
136*4882a593Smuzhiyun	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
137*4882a593Smuzhiyun	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
138*4882a593Smuzhiyun	sub		v8.16b, v8.16b, v15.16b
139*4882a593Smuzhiyun	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
140*4882a593Smuzhiyun	sub		v9.16b, v9.16b, v15.16b
141*4882a593Smuzhiyun	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
142*4882a593Smuzhiyun	sub		v10.16b, v10.16b, v15.16b
143*4882a593Smuzhiyun	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
144*4882a593Smuzhiyun	sub		v11.16b, v11.16b, v15.16b
145*4882a593Smuzhiyun	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
146*4882a593Smuzhiyun	sub		v8.16b, v8.16b, v15.16b
147*4882a593Smuzhiyun	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
148*4882a593Smuzhiyun	sub		v9.16b, v9.16b, v15.16b
149*4882a593Smuzhiyun	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
150*4882a593Smuzhiyun	sub		v10.16b, v10.16b, v15.16b
151*4882a593Smuzhiyun	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
152*4882a593Smuzhiyun	sub		v11.16b, v11.16b, v15.16b
153*4882a593Smuzhiyun	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
154*4882a593Smuzhiyun	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
155*4882a593Smuzhiyun	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
156*4882a593Smuzhiyun	.endm
157*4882a593Smuzhiyun
	/*
	 * Two-way interleaved mul_by_x: multiply both inputs by polynomial
	 * 'x' in GF(2^8).  \const must hold 0x1b in every byte.  Operands
	 * are bare vector registers (.16b is appended here).
	 */
158*4882a593Smuzhiyun	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
159*4882a593Smuzhiyun	sshr		\tmp0\().16b, \in0\().16b, #7
160*4882a593Smuzhiyun	shl		\out0\().16b, \in0\().16b, #1
161*4882a593Smuzhiyun	sshr		\tmp1\().16b, \in1\().16b, #7
162*4882a593Smuzhiyun	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
163*4882a593Smuzhiyun	shl		\out1\().16b, \in1\().16b, #1
164*4882a593Smuzhiyun	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
165*4882a593Smuzhiyun	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
166*4882a593Smuzhiyun	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
167*4882a593Smuzhiyun	.endm
168*4882a593Smuzhiyun
	/*
	 * Two-way interleaved mul_by_x2: multiply both inputs by polynomial
	 * 'x^2' in GF(2^8).  \const must hold 0x1b in every byte; the two
	 * bits shifted out of each byte are folded back in via pmul.
	 */
169*4882a593Smuzhiyun	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
170*4882a593Smuzhiyun	ushr		\tmp0\().16b, \in0\().16b, #6
171*4882a593Smuzhiyun	shl		\out0\().16b, \in0\().16b, #2
172*4882a593Smuzhiyun	ushr		\tmp1\().16b, \in1\().16b, #6
173*4882a593Smuzhiyun	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
174*4882a593Smuzhiyun	shl		\out1\().16b, \in1\().16b, #2
175*4882a593Smuzhiyun	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
176*4882a593Smuzhiyun	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
177*4882a593Smuzhiyun	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
178*4882a593Smuzhiyun	.endm
179*4882a593Smuzhiyun
	/*
	 * Two-way interleaved MixColumns (in place); \enc == 0 selects the
	 * inverse transform.  Requires v12 = 0x1b per byte and v14 = the
	 * .Lror32by8 permutation.  Clobbers v8-v11.
	 */
180*4882a593Smuzhiyun	.macro		mix_columns_2x, in0, in1, enc
181*4882a593Smuzhiyun	.if		\enc == 0
182*4882a593Smuzhiyun	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
183*4882a593Smuzhiyun	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12	/* v8/v9 = 4.in0/4.in1 */
184*4882a593Smuzhiyun	eor		\in0\().16b, \in0\().16b, v8.16b
185*4882a593Smuzhiyun	rev32		v8.8h, v8.8h				/* rotate each word by 16 bits */
186*4882a593Smuzhiyun	eor		\in1\().16b, \in1\().16b, v9.16b
187*4882a593Smuzhiyun	rev32		v9.8h, v9.8h
188*4882a593Smuzhiyun	eor		\in0\().16b, \in0\().16b, v8.16b
189*4882a593Smuzhiyun	eor		\in1\().16b, \in1\().16b, v9.16b
190*4882a593Smuzhiyun	.endif
191*4882a593Smuzhiyun
192*4882a593Smuzhiyun	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12	/* v8/v9 = 2.in0/2.in1 */
193*4882a593Smuzhiyun	rev32		v10.8h, \in0\().8h			/* inputs rotated by 16 bits */
194*4882a593Smuzhiyun	rev32		v11.8h, \in1\().8h
195*4882a593Smuzhiyun	eor		v10.16b, v10.16b, v8.16b
196*4882a593Smuzhiyun	eor		v11.16b, v11.16b, v9.16b
197*4882a593Smuzhiyun	eor		\in0\().16b, \in0\().16b, v10.16b
198*4882a593Smuzhiyun	eor		\in1\().16b, \in1\().16b, v11.16b
199*4882a593Smuzhiyun	tbl		\in0\().16b, {\in0\().16b}, v14.16b	/* rotate each word by 8 bits */
200*4882a593Smuzhiyun	tbl		\in1\().16b, {\in1\().16b}, v14.16b
201*4882a593Smuzhiyun	eor		\in0\().16b, \in0\().16b, v10.16b
202*4882a593Smuzhiyun	eor		\in1\().16b, \in1\().16b, v11.16b
203*4882a593Smuzhiyun	.endm
204*4882a593Smuzhiyun
	/*
	 * Four-way version of do_block: run all AES rounds on four blocks
	 * in place, sharing one round-key load per round.  Arguments match
	 * do_block (\rk is only read; \rkp and \i are clobbered).  As in
	 * do_block, v15 alternates between the round key and the 0x40
	 * constant required by sub_bytes_4x.
	 */
205*4882a593Smuzhiyun	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
206*4882a593Smuzhiyun	ld1		{v15.4s}, [\rk]			/* first round key */
207*4882a593Smuzhiyun	add		\rkp, \rk, #16
208*4882a593Smuzhiyun	mov		\i, \rounds
209*4882a593Smuzhiyun1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
210*4882a593Smuzhiyun	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
211*4882a593Smuzhiyun	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
212*4882a593Smuzhiyun	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
213*4882a593Smuzhiyun	movi		v15.16b, #0x40			/* rebasing constant for sub_bytes_4x */
214*4882a593Smuzhiyun	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
215*4882a593Smuzhiyun	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
216*4882a593Smuzhiyun	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
217*4882a593Smuzhiyun	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
218*4882a593Smuzhiyun	sub_bytes_4x	\in0, \in1, \in2, \in3
219*4882a593Smuzhiyun	subs		\i, \i, #1
220*4882a593Smuzhiyun	ld1		{v15.4s}, [\rkp], #16		/* next round key */
221*4882a593Smuzhiyun	beq		2222f				/* final round: no MixColumns */
222*4882a593Smuzhiyun	mix_columns_2x	\in0, \in1, \enc
223*4882a593Smuzhiyun	mix_columns_2x	\in2, \in3, \enc
224*4882a593Smuzhiyun	b		1111b
225*4882a593Smuzhiyun2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
226*4882a593Smuzhiyun	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
227*4882a593Smuzhiyun	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
228*4882a593Smuzhiyun	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
229*4882a593Smuzhiyun	.endm
230*4882a593Smuzhiyun
	/* encrypt four blocks in place (see do_block_4x for the argument list) */
231*4882a593Smuzhiyun	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
232*4882a593Smuzhiyun	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
233*4882a593Smuzhiyun	.endm
234*4882a593Smuzhiyun
	/* decrypt four blocks in place (see do_block_4x for the argument list) */
235*4882a593Smuzhiyun	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
236*4882a593Smuzhiyun	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
237*4882a593Smuzhiyun	.endm
238*4882a593Smuzhiyun
239*4882a593Smuzhiyun#include "aes-modes.S"
240*4882a593Smuzhiyun
241*4882a593Smuzhiyun	.section	".rodata", "a"
242*4882a593Smuzhiyun	.align		4
	/* tbl index vector implementing the encryption ShiftRows byte permutation */
243*4882a593Smuzhiyun.LForward_ShiftRows:
244*4882a593Smuzhiyun	.octa		0x0b06010c07020d08030e09040f0a0500
245*4882a593Smuzhiyun
	/* tbl index vector implementing the decryption (inverse) ShiftRows permutation */
246*4882a593Smuzhiyun.LReverse_ShiftRows:
247*4882a593Smuzhiyun	.octa		0x0306090c0f0205080b0e0104070a0d00
248*4882a593Smuzhiyun
	/* tbl index vector rotating each 32-bit word by 8 bits (loaded into v14) */
249*4882a593Smuzhiyun.Lror32by8:
250*4882a593Smuzhiyun	.octa		0x0c0f0e0d080b0a090407060500030201
251