xref: /optee_os/core/arch/arm/crypto/sm4_armv8a_ce_a64.S (revision 6f48ab5d3e350bca562beced9e3a261f24f02396)
1/* SPDX-License-Identifier: BSD-2-Clause */
2/*
3 * Copyright (c) Hisilicon Technologies Co., Ltd. 2023. All rights reserved.
4 * Copyright (C) 2022, Alibaba Group.
5 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
6 *
7 * SM4 optimization for ARMv8 by SM4 HW instruction, which is an optional
8 * Cryptographic Extension for ARMv8.2-A.
9 *
10 * The CE implementation refers to Linux kernel (sm4-ce-core.S contributed
11 * by Tianjia Zhang <tianjia.zhang@linux.alibaba.com>).
12 */
13
14#include <asm.S>
15
16.arch	armv8.2-a+crypto+sm4
17
/*
 * XTS tweaks: eight 128-bit values, each held in a pair of
 * general-purpose registers (l = low 64 bits, h = high 64 bits).
 */
#define tw0l	x7
#define tw0h	x8
#define tw1l	x9
#define tw1h	x10
#define tw2l	x11
#define tw2h	x12
#define tw3l	x13
#define tw3h	x14
#define tw4l	x15
#define tw4h	x16
#define tw5l	x17
#define tw5h	x18
#define tw6l	x19
#define tw6h	x20
#define tw7l	x21
#define tw7h	x22

/* general-purpose scratch registers (w/x views of x23, x24; w25) */
#define tmpw0	w23
#define tmpx0	x23
#define tmpw1	w24
#define tmpx1	x24
#define tmpw2	w25

/* round keys: v0-v7, four 32-bit round keys per vector */
#define RK0	v0
#define RK1	v1
#define RK2	v2
#define RK3	v3
#define RK4	v4
#define RK5	v5
#define RK6	v6
#define RK7	v7

/* data blocks: v8-v15 */
#define BLK0	v8
#define BLK1	v9
#define BLK2	v10
#define BLK3	v11
#define BLK4	v12
#define BLK5	v13
#define BLK6	v14
#define BLK7	v15

/* vector temporaries: v16-v24 */
#define TMP0	v16
#define TMP1	v17
#define TMP2	v18
#define TMP3	v19
#define TMP4	v20
#define TMP5	v21
#define TMP6	v22
#define TMP7	v23
#define TMP8	v24

/* CBC chaining value / current XTS tweak */
#define IV	v25
70
/*
 * Save x15-x30 and d8-d15 on the stack (192 bytes in total).
 *
 * Resulting layout, relative to the new sp:
 *   0x00 d14,d15   0x10 d12,d13   0x20 d10,d11   0x30 d8,d9
 *   0x40 x29,x30   0x50 x27,x28   0x60 x25,x26   0x70 x23,x24
 *   0x80 x21,x22   0x90 x19,x20   0xa0 x17,x18   0xb0 x15,x16
 * This is the layout frame_pop unwinds.
 */
.macro frame_push
	sub	sp, sp, #0xc0
	stp	x15, x16, [sp, #0xb0]
	stp	x17, x18, [sp, #0xa0]
	stp	x19, x20, [sp, #0x90]
	stp	x21, x22, [sp, #0x80]
	stp	x23, x24, [sp, #0x70]
	stp	x25, x26, [sp, #0x60]
	stp	x27, x28, [sp, #0x50]
	stp	x29, x30, [sp, #0x40]
	stp	d8, d9, [sp, #0x30]
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x10]
	stp	d14, d15, [sp, #0x00]
.endm
85
/*
 * Restore d8-d15 and x15-x30 from the 192-byte save area written by
 * frame_push, then release it.
 *
 * Expected layout, relative to sp on entry:
 *   0x00 d14,d15   0x10 d12,d13   0x20 d10,d11   0x30 d8,d9
 *   0x40 x29,x30   0x50 x27,x28   0x60 x25,x26   0x70 x23,x24
 *   0x80 x21,x22   0x90 x19,x20   0xa0 x17,x18   0xb0 x15,x16
 */
.macro frame_pop
	ldp	d14, d15, [sp, #0x00]
	ldp	d12, d13, [sp, #0x10]
	ldp	d10, d11, [sp, #0x20]
	ldp	d8, d9, [sp, #0x30]
	ldp	x29, x30, [sp, #0x40]
	ldp	x27, x28, [sp, #0x50]
	ldp	x25, x26, [sp, #0x60]
	ldp	x23, x24, [sp, #0x70]
	ldp	x21, x22, [sp, #0x80]
	ldp	x19, x20, [sp, #0x90]
	ldp	x17, x18, [sp, #0xa0]
	ldp	x15, x16, [sp, #0xb0]
	add	sp, sp, #0xc0
.endm
100
/*
 * Run the full SM4 round sequence (RK0..RK7) on \in and reverse the
 * order of its four 32-bit words afterwards.
 *
 * Unlike encrypt_block, no byte swapping is done here: the caller is
 * expected to apply rev32 to the input beforehand and to the result
 * afterwards.
 */
.macro	encrypt_block_no_rev, in
	/* eight sm4e instructions, one per round-key vector */
	sm4e	\in\().4s, RK0.4s
	sm4e	\in\().4s, RK1.4s
	sm4e	\in\().4s, RK2.4s
	sm4e	\in\().4s, RK3.4s
	sm4e	\in\().4s, RK4.4s
	sm4e	\in\().4s, RK5.4s
	sm4e	\in\().4s, RK6.4s
	sm4e	\in\().4s, RK7.4s

	/* word order 0,1,2,3 -> 3,2,1,0 */
	rev64	\in\().4s, \in\().4s
	ext	\in\().16b, \in\().16b, \in\().16b, #8
.endm
113
/*
 * Process one 16-byte block held in \in with the round keys currently
 * loaded in RK0..RK7. The block is taken and returned in memory byte
 * order.
 */
.macro	encrypt_block, in
	/* byte-swap each 32-bit word of the input */
	rev32	\in\().16b, \in\().16b

	/* eight sm4e instructions, one per round-key vector */
	sm4e	\in\().4s, RK0.4s
	sm4e	\in\().4s, RK1.4s
	sm4e	\in\().4s, RK2.4s
	sm4e	\in\().4s, RK3.4s
	sm4e	\in\().4s, RK4.4s
	sm4e	\in\().4s, RK5.4s
	sm4e	\in\().4s, RK6.4s
	sm4e	\in\().4s, RK7.4s

	/* reverse all 16 bytes: word-order reversal plus per-word swap */
	rev64	\in\().16b, \in\().16b
	ext	\in\().16b, \in\().16b, \in\().16b, #8
.endm
127
/*
 * Run the SM4 rounds on \in with the key schedule applied backwards:
 * RK7 first, RK0 last, each vector with its four words reversed.
 *
 * NOTE: RK0..RK7 are reversed in place, so the round-key registers no
 * longer hold the forward schedule once this macro has been expanded.
 */
.macro	decrypt_block, in
	rev32	\in\().16b, \in\().16b

	/* reverse the word order of the upper four round-key vectors */
	rev64	RK7.4s, RK7.4s
	ext	RK7.16b, RK7.16b, RK7.16b, #8
	rev64	RK6.4s, RK6.4s
	ext	RK6.16b, RK6.16b, RK6.16b, #8
	rev64	RK5.4s, RK5.4s
	ext	RK5.16b, RK5.16b, RK5.16b, #8
	rev64	RK4.4s, RK4.4s
	ext	RK4.16b, RK4.16b, RK4.16b, #8

	sm4e	\in\().4s, RK7.4s
	sm4e	\in\().4s, RK6.4s
	sm4e	\in\().4s, RK5.4s
	sm4e	\in\().4s, RK4.4s

	/* same for the lower four round-key vectors */
	rev64	RK3.4s, RK3.4s
	ext	RK3.16b, RK3.16b, RK3.16b, #8
	rev64	RK2.4s, RK2.4s
	ext	RK2.16b, RK2.16b, RK2.16b, #8
	rev64	RK1.4s, RK1.4s
	ext	RK1.16b, RK1.16b, RK1.16b, #8
	rev64	RK0.4s, RK0.4s
	ext	RK0.16b, RK0.16b, RK0.16b, #8

	sm4e	\in\().4s, RK3.4s
	sm4e	\in\().4s, RK2.4s
	sm4e	\in\().4s, RK1.4s
	sm4e	\in\().4s, RK0.4s

	/* reverse all 16 bytes of the result */
	rev64	\in\().16b, \in\().16b
	ext	\in\().16b, \in\().16b, \in\().16b, #8
.endm
157
/*
 * Process the single block in BLK0 with the round keys in RK0..RK7.
 * In/out: BLK0. Only BLK0 is modified.
 */
LOCAL_FUNC sm4_encrypt_block1x , :
	encrypt_block	BLK0
	ret
END_FUNC sm4_encrypt_block1x
162
/*
 * Process the single block in BLK0 with the schedule in RK0..RK7
 * applied in reverse order.
 * In/out: BLK0. RK0..RK7 are reversed in place by decrypt_block.
 */
LOCAL_FUNC sm4_decrypt_block1x , :
	decrypt_block	BLK0
	ret
END_FUNC sm4_decrypt_block1x
167
/* Apply one sm4e step keyed by \rk to each of BLK0..BLK3 */
.macro	sm4e_4blk, rk
	sm4e	BLK0.4s, \rk\().4s
	sm4e	BLK1.4s, \rk\().4s
	sm4e	BLK2.4s, \rk\().4s
	sm4e	BLK3.4s, \rk\().4s
.endm

/*
 * Process the four blocks in BLK0..BLK3 in parallel with the round
 * keys in RK0..RK7.
 * In/out: BLK0..BLK3. No other register is modified.
 */
LOCAL_FUNC sm4_encrypt_block4x , :
	/* byte-swap each 32-bit word of every block */
	.irp	blk, BLK0, BLK1, BLK2, BLK3
	rev32	\blk\().16b, \blk\().16b
	.endr

	sm4e_4blk	RK0
	sm4e_4blk	RK1
	sm4e_4blk	RK2
	sm4e_4blk	RK3
	sm4e_4blk	RK4
	sm4e_4blk	RK5
	sm4e_4blk	RK6
	sm4e_4blk	RK7

	/* reverse all 16 bytes of every block */
	.irp	blk, BLK0, BLK1, BLK2, BLK3
	rev64	\blk\().16b, \blk\().16b
	.endr
	.irp	blk, BLK0, BLK1, BLK2, BLK3
	ext	\blk\().16b, \blk\().16b, \blk\().16b, #8
	.endr
	ret
END_FUNC sm4_encrypt_block4x
225
/* Apply one sm4e step keyed by \rk to each of BLK0..BLK7 */
.macro	sm4e_8blk, rk
	sm4e	BLK0.4s, \rk\().4s
	sm4e	BLK1.4s, \rk\().4s
	sm4e	BLK2.4s, \rk\().4s
	sm4e	BLK3.4s, \rk\().4s
	sm4e	BLK4.4s, \rk\().4s
	sm4e	BLK5.4s, \rk\().4s
	sm4e	BLK6.4s, \rk\().4s
	sm4e	BLK7.4s, \rk\().4s
.endm

/*
 * Process the eight blocks in BLK0..BLK7 in parallel with the round
 * keys in RK0..RK7.
 * In/out: BLK0..BLK7. No other register is modified.
 */
LOCAL_FUNC sm4_encrypt_block8x , :
	/* byte-swap each 32-bit word of every block */
	.irp	blk, BLK0, BLK1, BLK2, BLK3, BLK4, BLK5, BLK6, BLK7
	rev32	\blk\().16b, \blk\().16b
	.endr

	sm4e_8blk	RK0
	sm4e_8blk	RK1
	sm4e_8blk	RK2
	sm4e_8blk	RK3
	sm4e_8blk	RK4
	sm4e_8blk	RK5
	sm4e_8blk	RK6
	sm4e_8blk	RK7

	/* reverse all 16 bytes of every block */
	.irp	blk, BLK0, BLK1, BLK2, BLK3, BLK4, BLK5, BLK6, BLK7
	rev64	\blk\().16b, \blk\().16b
	.endr
	.irp	blk, BLK0, BLK1, BLK2, BLK3, BLK4, BLK5, BLK6, BLK7
	ext	\blk\().16b, \blk\().16b, \blk\().16b, #8
	.endr
	ret
END_FUNC sm4_encrypt_block8x
327
/*
 * Materialise the current 128-bit counter into \vctr and advance it.
 *
 * \vctr.d[0] = \low and \vctr.d[1] = \high, each byte-reversed, then
 * the register pair is incremented: 1 is added to \high and the carry
 * is propagated into \low. Clobbers the condition flags.
 */
.macro	inc_le128, vctr, low, high
	mov	\vctr\().d[0], \low
	mov	\vctr\().d[1], \high
	rev64	\vctr\().16b, \vctr\().16b
	adds	\high, \high, #1
	adc	\low, \low, xzr
.endm
335
/*
 * Build a 128-bit vector from two general-purpose registers:
 * \desv = { d[0] = \src0, d[1] = \src1 }.
 */
.macro	mov_reg_to_vec, desv, src0, src1
	mov	\desv\().d[0], \src0		/* low 64 bits */
	mov	\desv\().d[1], \src1		/* high 64 bits */
.endm
340
/*
 * Derive the next tweak from the current one:
 *   \des1:\des0 = (\src1:\src0 << 1) ^ (0x87 if bit 127 was set)
 *
 * \src0/\des0 are the low 64 bits, \src1/\des1 the high 64 bits.
 * Clobbers tmpx0, tmpx1 and tmpw2; the condition flags are preserved.
 */
.macro	next_tweak, des0, des1, src0, src1
	mov	tmpw2, #0x87
	/* tmpw0 = upper 32 bits of \src1 (rotate right by 32) */
	extr	tmpx0, \src1, \src1, #32
	/* high half of the 128-bit shift: src1 << 1 | src0 >> 63 */
	extr	\des1, \src1, \src0, #63
	/* tmpx1 = 0x87 when bit 127 of the source is set, otherwise 0 */
	and	tmpw1, tmpw2, tmpw0, asr #31
	/* low half of the shift, with the reduction constant folded in */
	eor	\des0, tmpx1, \src0, lsl #1
.endm
348
/*
 * Vector flavour of next_tweak: \desv = next tweak after \srcv.
 * \desv and \srcv may be the same register.
 *
 * Clobbers tw0l, tw0h, tw1l, tw1h and the next_tweak scratch
 * registers; the condition flags are preserved.
 */
.macro	next_tweak_vec, desv, srcv
	/* unpack the source tweak into the tw0 register pair */
	mov	tw0l, \srcv\().d[0]
	mov	tw0h, \srcv\().d[1]
	next_tweak	tw1l, tw1h, tw0l, tw0h
	/* pack the result from the tw1 register pair */
	mov_reg_to_vec	\desv, tw1l, tw1h
.endm
356
/*
 * 32 constant words consumed, four per vector, by the sm4ekey
 * instructions in the key-setup functions.
 */
LOCAL_DATA .Lck , :
	.long	0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269
	.long	0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9
	.long	0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249
	.long	0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9
	.long	0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229
	.long	0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299
	.long	0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209
	.long	0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
END_DATA .Lck
367
/*
 * Four constant words XORed into the byte-swapped user key before key
 * expansion.
 */
LOCAL_DATA .Lfk , :
	.long	0xa3b1bac6
	.long	0x56aa3350
	.long	0x677d9197
	.long	0xb27022dc
END_DATA .Lfk
371
/*
 * void ce_sm4_setkey_enc(uint32_t sk[32], uint8_t const key[16]);
 *
 * Expand a 16-byte user key into 32 round keys, stored in forward
 * order.
 *
 * x0: round key (output, 128 bytes)
 * x1: user key (input, 16 bytes)
 *
 * Clobbers x0, x2, RK0..RK7 (v0-v7) and TMP0..TMP8 (v16-v24).
 */
FUNC ce_sm4_setkey_enc , :
	/* constants: .Lfk into TMP8, the eight .Lck vectors into TMP0..TMP7 */
	adr	x2, .Lfk
	ld1	{TMP8.4s}, [x2]
	adr	x2, .Lck
	ld1	{TMP0.4s, TMP1.4s, TMP2.4s, TMP3.4s}, [x2], #64
	ld1	{TMP4.4s, TMP5.4s, TMP6.4s, TMP7.4s}, [x2]

	/* seed = byte-swapped user key XOR .Lfk */
	ld1	{RK0.4s}, [x1]
	rev32	RK0.16b, RK0.16b
	eor	RK0.16b, RK0.16b, TMP8.16b

	/* each sm4ekey derives four round keys from the previous four */
	sm4ekey	RK0.4s, RK0.4s, TMP0.4s
	sm4ekey	RK1.4s, RK0.4s, TMP1.4s
	sm4ekey	RK2.4s, RK1.4s, TMP2.4s
	sm4ekey	RK3.4s, RK2.4s, TMP3.4s
	sm4ekey	RK4.4s, RK3.4s, TMP4.4s
	sm4ekey	RK5.4s, RK4.4s, TMP5.4s
	sm4ekey	RK6.4s, RK5.4s, TMP6.4s
	sm4ekey	RK7.4s, RK6.4s, TMP7.4s

	st1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x0], #64
	st1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x0]
	ret
END_FUNC ce_sm4_setkey_enc
398
/*
 * void ce_sm4_setkey_dec(uint32_t sk[32], uint8_t const key[16]);
 *
 * Expand a 16-byte user key into 32 round keys, stored in reverse
 * order (last round key first).
 *
 * x0: round key (output, 128 bytes)
 * x1: user key (input, 16 bytes)
 *
 * Clobbers x0, x2, RK0..RK7 (v0-v7) and TMP0..TMP8 (v16-v24).
 */
FUNC ce_sm4_setkey_dec , :
	/* constants: .Lfk into TMP8, the eight .Lck vectors into TMP0..TMP7 */
	adr	x2, .Lfk
	ld1	{TMP8.4s}, [x2]
	adr	x2, .Lck
	ld1	{TMP0.4s, TMP1.4s, TMP2.4s, TMP3.4s}, [x2], #64
	ld1	{TMP4.4s, TMP5.4s, TMP6.4s, TMP7.4s}, [x2]

	/* seed = byte-swapped user key XOR .Lfk */
	ld1	{RK7.4s}, [x1]
	rev32	RK7.16b, RK7.16b
	eor	RK7.16b, RK7.16b, TMP8.16b

	/*
	 * Derive the schedule into RK7 down to RK0, so the first four
	 * round keys end up in the last vector.
	 */
	sm4ekey	RK7.4s, RK7.4s, TMP0.4s
	sm4ekey	RK6.4s, RK7.4s, TMP1.4s
	sm4ekey	RK5.4s, RK6.4s, TMP2.4s
	sm4ekey	RK4.4s, RK5.4s, TMP3.4s
	sm4ekey	RK3.4s, RK4.4s, TMP4.4s
	sm4ekey	RK2.4s, RK3.4s, TMP5.4s
	sm4ekey	RK1.4s, RK2.4s, TMP6.4s
	sm4ekey	RK0.4s, RK1.4s, TMP7.4s

	/* reverse the four words inside every round-key vector */
	.irp	rk, RK7, RK6, RK5, RK4, RK3, RK2, RK1, RK0
	rev64	\rk\().4s, \rk\().4s
	ext	\rk\().16b, \rk\().16b, \rk\().16b, #8
	.endr

	st1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x0], #64
	st1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x0]
	ret
END_FUNC ce_sm4_setkey_dec
441
/*
 * void ce_sm4_ecb_encrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk[], size_t len);
 *
 * ECB-process len / 16 whole blocks; a trailing partial block is
 * ignored.
 *
 * x0: output
 * x1: input
 * x2: round key (128 bytes)
 * w3: length in bytes (only the low 32 bits are used)
 */
FUNC ce_sm4_ecb_encrypt , :
	frame_push

	lsr	w3, w3, #4			/* bytes -> blocks */
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64

	/* main loop: eight blocks per iteration */
.Lecb_8blk:
	cmp	w3, #8
	b.lt	.Lecb_tail
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
	bl	sm4_encrypt_block8x
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
	subs	w3, w3, #8
	b.gt	.Lecb_8blk

	/* fewer than eight blocks left: dispatch on the remaining count */
.Lecb_tail:
	cmp	w3, #1
	b.lt	.Lecb_done
	b.eq	.Lecb_1blk
	cmp	w3, #3
	b.lt	.Lecb_2blk
	b.eq	.Lecb_3blk

	/* four or more */
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	bl	sm4_encrypt_block4x
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	sub	w3, w3, #4
	b	.Lecb_tail

	/* the 4-way routine is reused; the unused lanes are discarded */
.Lecb_3blk:
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
	bl	sm4_encrypt_block4x
	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
	subs	w3, w3, #3
	b.le	.Lecb_done

.Lecb_2blk:
	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
	bl	sm4_encrypt_block4x
	st1	{BLK0.16b, BLK1.16b}, [x0], #32
	subs	w3, w3, #2
	b.le	.Lecb_done

.Lecb_1blk:
	ld1	{BLK0.16b}, [x1], #16
	bl	sm4_encrypt_block1x
	st1	{BLK0.16b}, [x0], #16

.Lecb_done:
	frame_pop
	ret

END_FUNC ce_sm4_ecb_encrypt
507
/*
 * void ce_sm4_cbc_encrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk[], size_t len,
 *			   uint8_t iv[]);
 *
 * CBC-encrypt len / 16 whole blocks. The last ciphertext block is
 * written back to iv[] so a later call can continue the chain.
 *
 * x0: output
 * x1: input
 * x2: round key (128 bytes)
 * w3: length in bytes (only the low 32 bits are used)
 * x4: iv (16 bytes, read and updated)
 */
FUNC ce_sm4_cbc_encrypt , :
	frame_push

	lsr	w3, w3, #4			/* bytes -> blocks */
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	ld1	{IV.16b}, [x4]

	/*
	 * Four blocks per iteration. The chain is inherently serial, so
	 * the blocks are encrypted one after another; the byte swaps are
	 * hoisted out of the per-block macro and the chaining XOR is done
	 * on the byte-swapped form.
	 */
.Lcbcenc_4blk:
	cmp	w3, #4
	b.lt	.Lcbcenc_tail
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	eor	BLK0.16b, BLK0.16b, IV.16b
	.irp	blk, BLK0, BLK1, BLK2, BLK3
	rev32	\blk\().16b, \blk\().16b
	.endr

	encrypt_block_no_rev	BLK0
	eor	BLK1.16b, BLK1.16b, BLK0.16b
	encrypt_block_no_rev	BLK1
	rev32	BLK0.16b, BLK0.16b		/* BLK0 back to byte order */
	eor	BLK2.16b, BLK2.16b, BLK1.16b
	encrypt_block_no_rev	BLK2
	rev32	BLK1.16b, BLK1.16b
	eor	BLK3.16b, BLK3.16b, BLK2.16b
	encrypt_block_no_rev	BLK3
	rev32	BLK2.16b, BLK2.16b
	rev32	BLK3.16b, BLK3.16b

	mov	IV.16b, BLK3.16b		/* chain into the next group */
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	subs	w3, w3, #4
	b	.Lcbcenc_4blk

	/* up to three remaining blocks, one at a time */
.Lcbcenc_tail:
	cmp	w3, #1
	b.lt	.Lcbcenc_done
.Lcbcenc_1blk:
	ld1	{BLK0.16b}, [x1], #16
	eor	BLK0.16b, BLK0.16b, IV.16b
	bl	sm4_encrypt_block1x
	mov	IV.16b, BLK0.16b
	st1	{BLK0.16b}, [x0], #16
	subs	w3, w3, #1
	b.ne	.Lcbcenc_1blk

.Lcbcenc_done:
	st1	{IV.16b}, [x4]
	frame_pop
	ret
END_FUNC ce_sm4_cbc_encrypt
566
/*
 * void ce_sm4_cbc_decrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk[], size_t len,
 *			   uint8_t iv[]);
 *
 * CBC-decrypt len / 16 whole blocks. The block routines apply rk[] in
 * the order given (presumably the schedule from ce_sm4_setkey_dec -
 * confirm against the caller). The last input block is written back to
 * iv[].
 *
 * After each group is processed, the input blocks are re-read through
 * x5 to obtain the chaining values, always before the corresponding
 * output is stored.
 *
 * x0: output
 * x1: input
 * x2: round key (128 bytes)
 * w3: length in bytes (only the low 32 bits are used)
 * x4: iv (16 bytes, read and updated)
 */
FUNC ce_sm4_cbc_decrypt , :
	frame_push

	lsr	w3, w3, #4			/* bytes -> blocks */
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	ld1	{IV.16b}, [x4]

	/* main loop: eight blocks per iteration */
.Lcbcdec_8blk:
	cmp	w3, #8
	b.lt	.Lcbcdec_tail

	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
	bl	sm4_encrypt_block8x

	/* x5 walks over the eight input blocks just consumed */
	sub	x5, x1, #128
	eor	BLK0.16b, BLK0.16b, IV.16b
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x5], #64
	eor	BLK1.16b, BLK1.16b, TMP0.16b
	eor	BLK2.16b, BLK2.16b, TMP1.16b
	eor	BLK3.16b, BLK3.16b, TMP2.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	ld1	{TMP4.16b, TMP5.16b, TMP6.16b, TMP7.16b}, [x5], #64
	eor	BLK4.16b, BLK4.16b, TMP3.16b
	eor	BLK5.16b, BLK5.16b, TMP4.16b
	eor	BLK6.16b, BLK6.16b, TMP5.16b
	eor	BLK7.16b, BLK7.16b, TMP6.16b
	mov	IV.16b, TMP7.16b		/* last input block chains on */
	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
	subs	w3, w3, #8
	b.gt	.Lcbcdec_8blk

	/* fewer than eight blocks left: dispatch on the remaining count */
.Lcbcdec_tail:
	cmp	w3, #1
	b.lt	.Lcbcdec_done
	b.eq	.Lcbcdec_1blk
	cmp	w3, #3
	b.lt	.Lcbcdec_2blk
	b.eq	.Lcbcdec_3blk

	/* four or more */
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	bl	sm4_encrypt_block4x
	sub	x5, x1, #64
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x5], #64
	eor	BLK0.16b, BLK0.16b, IV.16b
	eor	BLK1.16b, BLK1.16b, TMP0.16b
	eor	BLK2.16b, BLK2.16b, TMP1.16b
	eor	BLK3.16b, BLK3.16b, TMP2.16b
	mov	IV.16b, TMP3.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	sub	w3, w3, #4
	b	.Lcbcdec_tail

.Lcbcdec_3blk:
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
	bl	sm4_encrypt_block4x
	sub	x5, x1, #48
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b}, [x5], #48
	eor	BLK0.16b, BLK0.16b, IV.16b
	eor	BLK1.16b, BLK1.16b, TMP0.16b
	eor	BLK2.16b, BLK2.16b, TMP1.16b
	mov	IV.16b, TMP2.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
	subs	w3, w3, #3
	b.le	.Lcbcdec_done

.Lcbcdec_2blk:
	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
	bl	sm4_encrypt_block4x
	sub	x5, x1, #32
	ld1	{TMP0.16b, TMP1.16b}, [x5], #32
	eor	BLK0.16b, BLK0.16b, IV.16b
	eor	BLK1.16b, BLK1.16b, TMP0.16b
	mov	IV.16b, TMP1.16b
	st1	{BLK0.16b, BLK1.16b}, [x0], #32
	subs	w3, w3, #2
	b.le	.Lcbcdec_done

.Lcbcdec_1blk:
	ld1	{BLK0.16b}, [x1], #16
	bl	sm4_encrypt_block1x
	sub	x5, x1, #16
	ld1	{TMP0.16b}, [x5], #16
	eor	BLK0.16b, BLK0.16b, IV.16b
	mov	IV.16b, TMP0.16b
	st1	{BLK0.16b}, [x0], #16

.Lcbcdec_done:
	st1	{IV.16b}, [x4]
	frame_pop
	ret
END_FUNC ce_sm4_cbc_decrypt
670
/*
 * void ce_sm4_ctr_encrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk[], size_t len,
 *			   uint8_t iv[]);
 *
 * CTR-process len / 16 whole blocks. The 16-byte counter in iv[] is
 * held byte-reversed in x7 (first 8 bytes) and x8 (last 8 bytes);
 * each block adds 1 to x8 with carry into x7. The advanced counter
 * is written back to iv[].
 *
 * x0: output
 * x1: input
 * x2: round key (128 bytes)
 * w3: length in bytes (only the low 32 bits are used)
 * x4: iv / counter (16 bytes, read and updated)
 */
FUNC ce_sm4_ctr_encrypt , :
	frame_push

	lsr	w3, w3, #4			/* bytes -> blocks */
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	ldp	x7, x8, [x4]
	rev	x7, x7
	rev	x8, x8

	/* main loop: eight blocks per iteration */
.Lctr_8blk:
	cmp	w3, #8
	b.lt	.Lctr_tail

	/* eight consecutive counter blocks */
	inc_le128	BLK0, x7, x8
	inc_le128	BLK1, x7, x8
	inc_le128	BLK2, x7, x8
	inc_le128	BLK3, x7, x8
	inc_le128	BLK4, x7, x8
	inc_le128	BLK5, x7, x8
	inc_le128	BLK6, x7, x8
	inc_le128	BLK7, x7, x8
	bl	sm4_encrypt_block8x

	/* XOR the resulting keystream into the input */
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x1], #64
	ld1	{TMP4.16b, TMP5.16b, TMP6.16b, TMP7.16b}, [x1], #64
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	eor	BLK3.16b, BLK3.16b, TMP3.16b
	eor	BLK4.16b, BLK4.16b, TMP4.16b
	eor	BLK5.16b, BLK5.16b, TMP5.16b
	eor	BLK6.16b, BLK6.16b, TMP6.16b
	eor	BLK7.16b, BLK7.16b, TMP7.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
	subs	w3, w3, #8
	b.gt	.Lctr_8blk

	/* fewer than eight blocks left: dispatch on the remaining count */
.Lctr_tail:
	cmp	w3, #1
	b.lt	.Lctr_done
	b.eq	.Lctr_1blk
	cmp	w3, #3
	b.lt	.Lctr_2blk
	b.eq	.Lctr_3blk

	/* four or more */
	inc_le128	BLK0, x7, x8
	inc_le128	BLK1, x7, x8
	inc_le128	BLK2, x7, x8
	inc_le128	BLK3, x7, x8
	bl	sm4_encrypt_block4x
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x1], #64
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	eor	BLK3.16b, BLK3.16b, TMP3.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	sub	w3, w3, #4
	b	.Lctr_tail

.Lctr_3blk:
	inc_le128	BLK0, x7, x8
	inc_le128	BLK1, x7, x8
	inc_le128	BLK2, x7, x8
	bl	sm4_encrypt_block4x
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b}, [x1], #48
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
	subs	w3, w3, #3
	b.le	.Lctr_done

.Lctr_2blk:
	inc_le128	BLK0, x7, x8
	inc_le128	BLK1, x7, x8
	bl	sm4_encrypt_block4x
	ld1	{TMP0.16b, TMP1.16b}, [x1], #32
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	st1	{BLK0.16b, BLK1.16b}, [x0], #32
	subs	w3, w3, #2
	b.le	.Lctr_done

.Lctr_1blk:
	inc_le128	BLK0, x7, x8
	bl	sm4_encrypt_block1x
	ld1	{TMP0.16b}, [x1], #16
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	st1	{BLK0.16b}, [x0], #16

	/* hand the advanced counter back to the caller */
.Lctr_done:
	rev	x7, x7
	rev	x8, x8
	stp	x7, x8, [x4]
	frame_pop
	ret
END_FUNC ce_sm4_ctr_encrypt
781
/*
 * Common XTS worker behind ce_sm4_xts_encrypt / ce_sm4_xts_decrypt.
 *
 * x0: output
 * x1: input
 * x2: round key1 (data key, 128 bytes)
 * x3: round key2 (tweak key, 128 bytes)
 * w4: length in bytes (only the low 32 bits are used)
 * x5: iv (16 bytes, read and updated)
 * x26: 1 for encryption, anything else for decryption
 *
 * The tweak in iv[] is processed with round key2 on entry; on exit the
 * tweak for the next block is passed through decrypt_block with the
 * same key and stored back to iv[].
 *
 * When the length is not a multiple of 16 and at least one whole block
 * is present, the final whole block and the trailing bytes are handled
 * with ciphertext stealing. That path is safe for in-place operation
 * (out == in): every input tail byte is loaded before the output byte
 * sharing its address is stored.
 *
 * Uses the tw*, tmp* register aliases (x7-x25), x6 and v0-v25; the
 * callers are expected to have saved the callee-saved ones
 * (frame_push).
 */
LOCAL_FUNC xts_do_cipher , :
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{IV.16b}, [x5]
	/* load round key2 for first tweak */
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x3], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x3], #64
	encrypt_block	IV
	/* load round key1 for block cipher */
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	/* w6: trailing bytes (len % 16); the write also clears x6[63:32] */
	and	w6, w4, #0x0F
	/* w4: number of whole blocks */
	lsr	w4, w4, 4
	/* no whole block: nothing to do */
	cmp	w4, #1
	b.lt	.Lxtsout
	cmp	w6, 0
	b.eq	.Lxtsblks
	/* keep the last whole block back for ciphertext stealing */
	subs	w4, w4, #1
	b.eq	.Lxtstail
.Lxtsblks:
	/* precompute eight consecutive tweaks in tw0..tw7 */
	mov	tw0l, IV.d[0]
	mov	tw0h, IV.d[1]
	next_tweak	tw1l, tw1h, tw0l, tw0h
	next_tweak	tw2l, tw2h, tw1l, tw1h
	next_tweak	tw3l, tw3h, tw2l, tw2h
	next_tweak	tw4l, tw4h, tw3l, tw3h
	next_tweak	tw5l, tw5h, tw4l, tw4h
	next_tweak	tw6l, tw6h, tw5l, tw5h
	next_tweak	tw7l, tw7h, tw6l, tw6h
.Lxtsloop8x:
	cmp	w4, 8
	b.lt	.Lxts4x
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	mov_reg_to_vec	TMP0, tw0l, tw0h
	mov_reg_to_vec	TMP1, tw1l, tw1h
	mov_reg_to_vec	TMP2, tw2l, tw2h
	mov_reg_to_vec	TMP3, tw3l, tw3h
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	eor	BLK3.16b, BLK3.16b, TMP3.16b
	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
	mov_reg_to_vec	TMP4, tw4l, tw4h
	mov_reg_to_vec	TMP5, tw5l, tw5h
	mov_reg_to_vec	TMP6, tw6l, tw6h
	mov_reg_to_vec	IV, tw7l, tw7h
	eor	BLK4.16b, BLK4.16b, TMP4.16b
	eor	BLK5.16b, BLK5.16b, TMP5.16b
	eor	BLK6.16b, BLK6.16b, TMP6.16b
	eor	BLK7.16b, BLK7.16b, IV.16b

	bl	sm4_encrypt_block8x

	/*
	 * Rebuild the tweak vectors for the output XOR while advancing
	 * tw0..tw7 to the next eight tweaks. IV keeps the last tweak
	 * actually used.
	 */
	mov_reg_to_vec	TMP0, tw0l, tw0h
	next_tweak	tw0l, tw0h, tw7l, tw7h
	mov_reg_to_vec	TMP1, tw1l, tw1h
	next_tweak	tw1l, tw1h, tw0l, tw0h
	mov_reg_to_vec	TMP2, tw2l, tw2h
	next_tweak	tw2l, tw2h, tw1l, tw1h
	mov_reg_to_vec	TMP3, tw3l, tw3h
	next_tweak	tw3l, tw3h, tw2l, tw2h
	mov_reg_to_vec	TMP4, tw4l, tw4h
	next_tweak	tw4l, tw4h, tw3l, tw3h
	mov_reg_to_vec	TMP5, tw5l, tw5h
	next_tweak	tw5l, tw5h, tw4l, tw4h
	mov_reg_to_vec	TMP6, tw6l, tw6h
	next_tweak	tw6l, tw6h, tw5l, tw5h
	mov_reg_to_vec	IV, tw7l, tw7h
	next_tweak	tw7l, tw7h, tw6l, tw6h

	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	eor	BLK3.16b, BLK3.16b, TMP3.16b
	eor	BLK4.16b, BLK4.16b, TMP4.16b
	eor	BLK5.16b, BLK5.16b, TMP5.16b
	eor	BLK6.16b, BLK6.16b, TMP6.16b
	eor	BLK7.16b, BLK7.16b, IV.16b

	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
	subs	w4, w4, #8
	b.gt	.Lxtsloop8x

.Lxts4x:
	cmp	w4, 1
	b.lt	.Lxtsblksout
	cmp	w4, 2
	b.lt	.Lxts1x
	cmp	w4, 3
	b.lt	.Lxts2x
	cmp	w4, 4
	b.lt	.Lxts3x
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	/* BLK4..BLK6 are free here and are not touched by the 4x routine */
	mov_reg_to_vec	BLK4, tw0l, tw0h
	mov_reg_to_vec	BLK5, tw1l, tw1h
	mov_reg_to_vec	BLK6, tw2l, tw2h
	mov_reg_to_vec	IV, tw3l, tw3h
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, BLK5.16b
	eor	BLK2.16b, BLK2.16b, BLK6.16b
	eor	BLK3.16b, BLK3.16b, IV.16b
	bl	sm4_encrypt_block4x
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, BLK5.16b
	eor	BLK2.16b, BLK2.16b, BLK6.16b
	eor	BLK3.16b, BLK3.16b, IV.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	sub	w4, w4, #4

	/* at most three blocks remain: shift tweaks 4..6 down to 0..2 */
	mov	tw0l, tw4l
	mov	tw0h, tw4h
	mov	tw1l, tw5l
	mov	tw1h, tw5h
	mov	tw2l, tw6l
	mov	tw2h, tw6h
	b	.Lxts4x

.Lxts3x:
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
	mov_reg_to_vec	BLK4, tw0l, tw0h
	mov_reg_to_vec	BLK5, tw1l, tw1h
	mov_reg_to_vec	IV, tw2l, tw2h
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, BLK5.16b
	eor	BLK2.16b, BLK2.16b, IV.16b
	bl	sm4_encrypt_block4x
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, BLK5.16b
	eor	BLK2.16b, BLK2.16b, IV.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
	subs	w4, w4, #3
	b.le	.Lxtsblksout

.Lxts2x:
	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
	mov_reg_to_vec	BLK4, tw0l, tw0h
	mov_reg_to_vec	IV, tw1l, tw1h
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, IV.16b
	bl	sm4_encrypt_block4x
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, IV.16b
	st1	{BLK0.16b, BLK1.16b}, [x0], #32
	subs	w4, w4, #2
	b.le	.Lxtsblksout

.Lxts1x:
	ld1	{BLK0.16b}, [x1], #16
	mov_reg_to_vec	IV, tw0l, tw0h
	eor	BLK0.16b, BLK0.16b, IV.16b
	bl	sm4_encrypt_block1x
	eor	BLK0.16b, BLK0.16b, IV.16b
	st1	{BLK0.16b}, [x0], #16
.Lxtsblksout:
	cmp	w6, 0
	/*
	 * IV holds the last tweak used; step to the following one.
	 * next_tweak_vec leaves the flags from the cmp above intact.
	 */
	next_tweak_vec	IV, IV
	b.eq	.Lxtsout
.Lxtstail:
	/* IV: tweak for the last whole block, TMP7: the one after it */
	next_tweak_vec	TMP7, IV
	cmp	x26, 1
	b.eq	1f
	/* The last two tweaks IV, TMP7 need to be swapped for decryption */
	mov	TMP8.16b, IV.16b
	mov	IV.16b, TMP7.16b
	mov	TMP7.16b, TMP8.16b
1:
	ld1	{BLK0.16b}, [x1], #16
	eor	BLK0.16b, BLK0.16b, IV.16b
	bl	sm4_encrypt_block1x
	eor	BLK0.16b, BLK0.16b, IV.16b
	st1	{BLK0.16b}, [x0], #16
	/* x7: address of the block just written */
	sub	x7, x0, 16
	/*
	 * Ciphertext stealing, one byte per iteration from index
	 * remain - 1 down to 0: the byte of the block just written moves
	 * to the trailing output, and the trailing input byte takes its
	 * place. Both loads are issued before either store so the swap
	 * is also correct when the output buffer is the input buffer
	 * ([x0 + i] and [x1 + i] are then the same byte).
	 */
10:
	subs	x6, x6, 1
	ldrb	tmpw0, [x7, x6]
	ldrb	tmpw1, [x1, x6]
	strb	tmpw0, [x0, x6]
	strb	tmpw1, [x7, x6]
	b.gt	10b
	ld1	{BLK0.16b}, [x7]
	eor	BLK0.16b, BLK0.16b, TMP7.16b
	bl	sm4_encrypt_block1x
	eor	BLK0.16b, BLK0.16b, TMP7.16b
	st1	{BLK0.16b}, [x7]

.Lxtsout:
	/* load round key2 for last tweak */
	sub	x3, x3, #128
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x3], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x3], #64
	/* decrypt last tweak for next update */
	decrypt_block	IV
	st1	{IV.16b}, [x5]
	ldp	x29, x30, [sp], #16
	ret
END_FUNC xts_do_cipher
992
/*
 * void ce_sm4_xts_encrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk1[], uint8_t const rk2[],
 *			   size_t len, uint8_t iv[])
 *
 * XTS encryption entry point: selects the encrypt direction in x26
 * and hands all arguments, untouched, to xts_do_cipher.
 *
 * x0: output
 * x1: input
 * x2: round key1
 * x3: round key2
 * w4: len
 * x5: iv
 */
FUNC ce_sm4_xts_encrypt , :
	frame_push
	mov	x26, #1				/* 1: encrypt */
	bl	xts_do_cipher
	frame_pop
	ret

END_FUNC ce_sm4_xts_encrypt
1012
/*
 * void ce_sm4_xts_decrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk1[], uint8_t const rk2[],
 *			   size_t len, uint8_t iv[])
 *
 * XTS decryption entry point: selects the decrypt direction in x26
 * and hands all arguments, untouched, to xts_do_cipher.
 *
 * x0: output
 * x1: input
 * x2: round key1
 * x3: round key2
 * w4: len
 * x5: iv
 */
FUNC ce_sm4_xts_decrypt , :
	frame_push
	mov	x26, #0				/* 0: decrypt */
	bl	xts_do_cipher
	frame_pop
	ret
END_FUNC ce_sm4_xts_decrypt
1031
1032BTI(emit_aarch64_feature_1_and     GNU_PROPERTY_AARCH64_FEATURE_1_BTI)
1033