/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * Copyright (c) Hisilicon Technologies Co., Ltd. 2023. All rights reserved.
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 *
 * SM4 optimization for ARMv8 by NEON and AES HW instruction, which is an
 * optional Cryptographic Extension for ARMv8-A.
 *
 * The NEON implementation refers to Linux kernel (sm4-neon-core.S contributed
 * by Tianjia Zhang <tianjia.zhang@linux.alibaba.com>).
 *
 * The AES trick refers to sm4ni (https://github.com/mjosaarinen/sm4ni). The
 * constants used in load_sbox_matrix are from this blog (https://www.cnblogs.
 * com/kentle/p/15826075.html). We've done some further optimizations so the
 * constants don't look the same.
 */
18
19#include <asm.S>
20
21.arch	armv8-a+crypto
22
23#define m0	w9
24#define m1	w10
25#define m2	w11
26#define m3	w12
27#define tw0l	x7
28#define tw0h	x8
29#define tw1l	x9
30#define tw1h	x10
31#define tw2l	x11
32#define tw2h	x12
33#define tw3l	x13
34#define tw3h	x14
35#define tw4l	x15
36#define tw4h	x16
37#define tw5l	x17
38#define tw5h	x18
39#define tw6l	x19
40#define tw6h	x20
41#define tw7l	x21
42#define tw7h	x22
43#define tmpw0	w23
44#define tmpx0	x23
45#define tmpw1	w24
46#define tmpx1	x24
47#define tmpw2	w25
48
49/* round keys: v0-v7 */
50#define RK0	v0
51#define RK1	v1
52#define RK2	v2
53#define RK3	v3
54#define RK4	v4
55#define RK5	v5
56#define RK6	v6
57#define RK7	v7
58
59/* plain blocks: v8-v15 */
60#define BLK0	v8
61#define BLK1	v9
62#define BLK2	v10
63#define BLK3	v11
64#define BLK4	v12
65#define BLK5	v13
66#define BLK6	v14
67#define BLK7	v15
68
69#define TMP0	v16
70#define TMP1	v17
71#define TMP2	v18
72#define TMP3	v19
73#define TMP4	v20
74#define TMP5	v21
75#define TMP6	v22
76#define TMP7	v23
77#define	TMP8	v24
78#define	IV	v25
79#define ANDMASKV	v26
80#define ANDMASKQ	q26
81#define ATALMaskV	v27
82#define ATALMaskQ	q27
83#define ATAHMaskV	v28
84#define ATAHMaskQ	q28
85#define TALMaskV	v29
86#define TALMaskQ	q29
87#define TAHMaskV	v30
88#define TAHMaskQ	q30
89#define MASKV	v31
90#define MASKQ	q31
91
92.macro frame_push
93	stp	x15, x16, [sp, #-0x10]!
94	stp	x17, x18, [sp, #-0x10]!
95	stp	x19, x20, [sp, #-0x10]!
96	stp	x21, x22, [sp, #-0x10]!
97	stp	x23, x24, [sp, #-0x10]!
98	stp	x25, x26, [sp, #-0x10]!
99	stp	x27, x28, [sp, #-0x10]!
100	stp	x29, x30, [sp, #-0x10]!
101	stp	d8, d9, [sp, #-0x10]!
102	stp	d10, d11, [sp, #-0x10]!
103	stp	d12, d13, [sp, #-0x10]!
104	stp	d14, d15, [sp, #-0x10]!
105.endm
106
107.macro frame_pop
108	ldp	d14, d15, [sp], #0x10
109	ldp	d12, d13, [sp], #0x10
110	ldp	d10, d11, [sp], #0x10
111	ldp	d8, d9, [sp], #0x10
112	ldp	x29, x30, [sp], #0x10
113	ldp	x27, x28, [sp], #0x10
114	ldp	x25, x26, [sp], #0x10
115	ldp	x23, x24, [sp], #0x10
116	ldp	x21, x22, [sp], #0x10
117	ldp	x19, x20, [sp], #0x10
118	ldp	x17, x18, [sp], #0x10
119	ldp	x15, x16, [sp], #0x10
120.endm
121
122.macro load_sbox_matrix
123	ldr MASKQ, .Lsbox_magic
124	ldr TAHMaskQ, .Lsbox_magic+16
125	ldr TALMaskQ, .Lsbox_magic+32
126	ldr ATAHMaskQ, .Lsbox_magic+48
127	ldr ATALMaskQ, .Lsbox_magic+64
128	ldr ANDMASKQ, .Lsbox_magic+80
129.endm
130
131.macro	multi_matrix, x, high, low, tmp
132	ushr	\tmp\().16b, \x\().16b, 4
133	and	\x\().16b, \x\().16b, ANDMASKV.16b
134	tbl	\x\().16b, {\low\().16b}, \x\().16b
135	tbl	\tmp\().16b, {\high\().16b}, \tmp\().16b
136	eor	\x\().16b, \x\().16b, \tmp\().16b
137.endm
138
139.macro	sbox, des, src, tmp1, tmp2
140	tbl	\des\().16b, {\src\().16b}, MASKV.16b
141	multi_matrix	\des, TAHMaskV, TALMaskV, \tmp2
142	eor	\tmp1\().16b, \tmp1\().16b, \tmp1\().16b
143	aese	\des\().16b, \tmp1\().16b
144	multi_matrix	\des, ATAHMaskV, ATALMaskV, \tmp2
145.endm
146
147.macro	sbox_double, des0, src0, des1, src1, tmp1, tmp2
148	tbl	\des0\().16b, {\src0\().16b}, MASKV.16b
149	tbl	\des1\().16b, {\src1\().16b}, MASKV.16b
150	multi_matrix	\des0, TAHMaskV, TALMaskV, \tmp2
151	multi_matrix	\des1, TAHMaskV, TALMaskV, \tmp2
152	eor	\tmp1\().16b, \tmp1\().16b, \tmp1\().16b
153	aese	\des0\().16b, \tmp1\().16b
154	multi_matrix	\des0, ATAHMaskV, ATALMaskV, \tmp2
155	aese	\des1\().16b, \tmp1\().16b
156	multi_matrix	\des1, ATAHMaskV, ATALMaskV, \tmp2
157.endm
158
159.macro	round, c0, c1, c2, c3, k
160	mov	tmpw0, \k
161	eor	tmpw1, \c1, \c2
162	eor	tmpw0, \c3, tmpw0
163	eor	tmpw2, tmpw1, tmpw0
164	mov	TMP0.s[0], tmpw2
165	/* nonlinear transformation */
166	sbox	TMP1, TMP0, TMP2, TMP3
167	/* linear transformation */
168	mov	tmpw2, TMP1.s[0]
169	ror	tmpw0, tmpw2, #(32-10)
170	eor	tmpw0, tmpw0, tmpw2, ror #(32-2)
171	ror	tmpw1, tmpw2, #(32-24)
172	eor	tmpw1, tmpw1, tmpw2, ror #(32-18)
173	eor	tmpw0, tmpw0, tmpw1
174	eor	tmpw2, tmpw0, tmpw2
175	eor	\c0, \c0, tmpw2
176.endm
177
178.macro	round4_enc, k
179	round	m0, m1, m2, m3, \k\().s[0]
180	round	m1, m2, m3, m0, \k\().s[1]
181	round	m2, m3, m0, m1, \k\().s[2]
182	round	m3, m0, m1, m2, \k\().s[3]
183.endm
184
185.macro	round4_dec, k
186	round	m0, m1, m2, m3, \k\().s[3]
187	round	m1, m2, m3, m0, \k\().s[2]
188	round	m2, m3, m0, m1, \k\().s[1]
189	round	m3, m0, m1, m2, \k\().s[0]
190.endm
191
192.macro	encrypt_block_no_rev, in
193	mov	m0, \in\().s[0]
194	mov	m1, \in\().s[1]
195	mov	m2, \in\().s[2]
196	mov	m3, \in\().s[3]
197	round4_enc	RK0
198	round4_enc	RK1
199	round4_enc	RK2
200	round4_enc	RK3
201	round4_enc	RK4
202	round4_enc	RK5
203	round4_enc	RK6
204	round4_enc	RK7
205	mov	\in\().s[0], m3
206	mov	\in\().s[1], m2
207	mov	\in\().s[2], m1
208	mov	\in\().s[3], m0
209.endm
210
211.macro	encrypt_block, in
212	rev32	\in\().16b, \in\().16b
213	encrypt_block_no_rev	\in
214	rev32	\in\().16b, \in\().16b
215.endm
216
217.macro	decrypt_block_no_rev, in
218	mov	m0, \in\().s[0]
219	mov	m1, \in\().s[1]
220	mov	m2, \in\().s[2]
221	mov	m3, \in\().s[3]
222	round4_dec	RK7
223	round4_dec	RK6
224	round4_dec	RK5
225	round4_dec	RK4
226	round4_dec	RK3
227	round4_dec	RK2
228	round4_dec	RK1
229	round4_dec	RK0
230	mov	\in\().s[0], m3
231	mov	\in\().s[1], m2
232	mov	\in\().s[2], m1
233	mov	\in\().s[3], m0
234.endm
235
236.macro	decrypt_block, in
237	rev32	\in\().16b, \in\().16b
238	decrypt_block_no_rev	\in
239	rev32	\in\().16b, \in\().16b
240.endm
241
242LOCAL_FUNC sm4_encrypt_block1x , :
243	encrypt_block	BLK0
244	ret
245END_FUNC sm4_encrypt_block1x
246
247LOCAL_FUNC sm4_decrypt_block1x , :
248	decrypt_block	BLK0
249	ret
250END_FUNC sm4_decrypt_block1x
251
252.macro	transpose_4x4, s0, s1, s2, s3
253	zip1	TMP0.4s, \s0\().4s, \s1\().4s
254	zip1	TMP1.4s, \s2\().4s, \s3\().4s
255	zip2	TMP2.4s, \s0\().4s, \s1\().4s
256	zip2	TMP3.4s, \s2\().4s, \s3\().4s
257	zip1	\s0\().2d, TMP0.2d, TMP1.2d
258	zip2	\s1\().2d, TMP0.2d, TMP1.2d
259	zip1	\s2\().2d, TMP2.2d, TMP3.2d
260	zip2	\s3\().2d, TMP2.2d, TMP3.2d
261.endm
262
263.macro	rotate_clockwise_90, s0, s1, s2, s3
264	zip1	TMP0.4s, \s1\().4s, \s0\().4s
265	zip2	TMP1.4s, \s1\().4s, \s0\().4s
266	zip1	TMP2.4s, \s3\().4s, \s2\().4s
267	zip2	TMP3.4s, \s3\().4s, \s2\().4s
268	zip1	\s0\().2d, TMP2.2d, TMP0.2d
269	zip2	\s1\().2d, TMP2.2d, TMP0.2d
270	zip1	\s2\().2d, TMP3.2d, TMP1.2d
271	zip2	\s3\().2d, TMP3.2d, TMP1.2d
272.endm
273
274
275.macro	round_4x, s0, s1, s2, s3, k
276	dup	TMP8.4s, \k
277	eor	TMP1.16b, \s2\().16b, \s3\().16b
278	eor	TMP8.16b, TMP8.16b, \s1\().16b
279	eor	TMP8.16b, TMP8.16b, TMP1.16b
280
281	/* nonlinear transformation */
282	sbox	TMP0, TMP8, TMP2, TMP3
283
284	/* linear transformation */
285	shl	TMP1.4s, TMP0.4s, #2
286	shl	TMP2.4s, TMP0.4s, #10
287	shl	TMP3.4s, TMP0.4s, #18
288	shl	TMP4.4s, TMP0.4s, #24
289	sri	TMP1.4s, TMP0.4s, #(32-2)
290	sri	TMP2.4s, TMP0.4s, #(32-10)
291	sri	TMP3.4s, TMP0.4s, #(32-18)
292	sri	TMP4.4s, TMP0.4s, #(32-24)
293	eor	TMP0.16b, TMP0.16b, TMP1.16b
294	eor	TMP2.16b, TMP2.16b, TMP3.16b
295	eor	TMP4.16b, TMP4.16b, \s0\().16b
296	eor	TMP0.16b, TMP0.16b, TMP2.16b
297	eor	\s0\().16b, TMP0.16b, TMP4.16b
298.endm
299
300.macro	round4_4x, k
301	round_4x	BLK0, BLK1, BLK2, BLK3, \k\().s[0]
302	round_4x	BLK1, BLK2, BLK3, BLK0, \k\().s[1]
303	round_4x	BLK2, BLK3, BLK0, BLK1, \k\().s[2]
304	round_4x	BLK3, BLK0, BLK1, BLK2, \k\().s[3]
305.endm
306
307LOCAL_FUNC sm4_encrypt_block4x , :
308	rev32	BLK0.16b, BLK0.16b
309	rev32	BLK1.16b, BLK1.16b
310	rev32	BLK2.16b, BLK2.16b
311	rev32	BLK3.16b, BLK3.16b
312
313	transpose_4x4	BLK0, BLK1, BLK2, BLK3
314
315	round4_4x	RK0
316	round4_4x	RK1
317	round4_4x	RK2
318	round4_4x	RK3
319	round4_4x	RK4
320	round4_4x	RK5
321	round4_4x	RK6
322	round4_4x	RK7
323
324	rotate_clockwise_90	BLK0, BLK1, BLK2, BLK3
325	rev32	BLK0.16b, BLK0.16b
326	rev32	BLK1.16b, BLK1.16b
327	rev32	BLK2.16b, BLK2.16b
328	rev32	BLK3.16b, BLK3.16b
329	ret
330END_FUNC sm4_encrypt_block4x
331
332.macro	round_8x, s0, s1, s2, s3, t0, t1, t2, t3, k
333	dup	TMP8.4s, \k
334	eor	TMP0.16b, \s2\().16b, \s3\().16b
335	mov	TMP7.16b, TMP8.16b
336	eor	TMP1.16b, \t2\().16b, \t3\().16b
337	eor	TMP8.16b, TMP8.16b, \s1\().16b
338	eor	TMP7.16b, TMP7.16b, \t1\().16b
339	eor	TMP8.16b, TMP8.16b, TMP0.16b
340	eor	TMP7.16b, TMP7.16b, TMP1.16b
341
342	/* nonlinear transformation */
343	sbox_double	TMP0, TMP8, TMP1, TMP7, TMP2, TMP3
344
345	/* linear transformation */
346	shl	TMP6.4s, TMP0.4s, #2
347	shl	TMP8.4s, TMP1.4s, #2
348	shl	TMP2.4s, TMP0.4s, #10
349	shl	TMP5.4s, TMP1.4s, #10
350	shl	TMP3.4s, TMP0.4s, #18
351	shl	TMP4.4s, TMP0.4s, #24
352	sri	TMP6.4s, TMP0.4s, #(32-2)
353	sri	TMP2.4s, TMP0.4s, #(32-10)
354	sri	TMP3.4s, TMP0.4s, #(32-18)
355	sri	TMP4.4s, TMP0.4s, #(32-24)
356	eor	TMP0.16b, TMP0.16b, TMP6.16b
357	eor	TMP2.16b, TMP2.16b, TMP3.16b
358	shl	TMP6.4s, TMP1.4s, #18
359	shl	TMP7.4s, TMP1.4s, #24
360	sri	TMP8.4s, TMP1.4s, #(32-2)
361	sri	TMP5.4s, TMP1.4s, #(32-10)
362	sri	TMP6.4s, TMP1.4s, #(32-18)
363	sri	TMP7.4s, TMP1.4s, #(32-24)
364	eor	TMP4.16b, TMP4.16b, \s0\().16b
365	eor	TMP1.16b, TMP1.16b, TMP8.16b
366	eor	\s0\().16b, TMP0.16b, TMP2.16b
367	eor	\s0\().16b, \s0\().16b, TMP4.16b
368	eor	TMP5.16b, TMP5.16b, TMP6.16b
369	eor	TMP7.16b, TMP7.16b, \t0\().16b
370	eor	TMP1.16b, TMP1.16b, TMP5.16b
371	eor	\t0\().16b, TMP1.16b, TMP7.16b
372.endm
373
374.macro	round4_8x, k
375	round_8x BLK0, BLK1, BLK2, BLK3, BLK4, BLK5, BLK6, BLK7, \k\().s[0]
376	round_8x BLK1, BLK2, BLK3, BLK0, BLK5, BLK6, BLK7, BLK4, \k\().s[1]
377	round_8x BLK2, BLK3, BLK0, BLK1, BLK6, BLK7, BLK4, BLK5, \k\().s[2]
378	round_8x BLK3, BLK0, BLK1, BLK2, BLK7, BLK4, BLK5, BLK6, \k\().s[3]
379.endm
380
381LOCAL_FUNC sm4_encrypt_block8x , :
382	rev32	BLK0.16b, BLK0.16b
383	rev32	BLK1.16b, BLK1.16b
384	rev32	BLK2.16b, BLK2.16b
385	rev32	BLK3.16b, BLK3.16b
386	rev32	BLK4.16b, BLK4.16b
387	rev32	BLK5.16b, BLK5.16b
388	rev32	BLK6.16b, BLK6.16b
389	rev32	BLK7.16b, BLK7.16b
390
391	transpose_4x4	BLK0, BLK1, BLK2, BLK3
392	transpose_4x4	BLK4, BLK5, BLK6, BLK7
393
394	round4_8x	RK0
395	round4_8x	RK1
396	round4_8x	RK2
397	round4_8x	RK3
398	round4_8x	RK4
399	round4_8x	RK5
400	round4_8x	RK6
401	round4_8x	RK7
402
403	rotate_clockwise_90	BLK0, BLK1, BLK2, BLK3
404	rotate_clockwise_90	BLK4, BLK5, BLK6, BLK7
405
406	rev32	BLK0.16b, BLK0.16b
407	rev32	BLK1.16b, BLK1.16b
408	rev32	BLK2.16b, BLK2.16b
409	rev32	BLK3.16b, BLK3.16b
410	rev32	BLK4.16b, BLK4.16b
411	rev32	BLK5.16b, BLK5.16b
412	rev32	BLK6.16b, BLK6.16b
413	rev32	BLK7.16b, BLK7.16b
414	ret
415END_FUNC sm4_encrypt_block8x
416
417.macro	inc_le128, vctr, low, high
418	mov	\vctr\().d[1], \high
419	mov	\vctr\().d[0], \low
420	adds	\high, \high, #1
421	adc	\low, \low, xzr
422	rev64	\vctr\().16b, \vctr\().16b
423.endm
424
425.macro	mov_reg_to_vec, desv, src0, src1
426	mov	\desv\().d[0], \src0
427	mov	\desv\().d[1], \src1
428.endm
429
430.macro	next_tweak, des0, des1, src0, src1
431	mov	tmpw2, 0x87
432	extr	tmpx0, \src1, \src1, #32
433	extr	\des1, \src1, \src0, #63
434	and	tmpw1, tmpw2, tmpw0, asr#31
435	eor	\des0, tmpx1, \src0, lsl#1
436.endm
437
438.macro	next_tweak_vec, desv, srcv
439	mov	tw0l, \srcv\().d[0]
440	mov	tw0h, \srcv\().d[1]
441	next_tweak	tw1l, tw1h, tw0l, tw0h
442	mov	\desv\().d[0], tw1l
443	mov	\desv\().d[1], tw1h
444.endm
445
446LOCAL_DATA .Lck , :
447	.long	0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
448	.long	0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
449	.long	0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
450	.long	0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
451	.long	0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
452	.long	0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
453	.long	0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
454	.long	0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
455END_DATA .Lck
456
457LOCAL_DATA .Lfk , :
458	.long	0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
459END_DATA .Lfk
460
461LOCAL_DATA .Lshuffles , :
462	.long	0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x03020100
463END_DATA .Lshuffles
464
465LOCAL_DATA .Lsbox_magic , :
466	.dword	0x0b0e0104070a0d00, 0x0306090c0f020508
467	.dword	0x62185a2042387a00, 0x22581a6002783a40
468	.dword	0x15df62a89e54e923, 0xc10bb67c4a803df7
469	.dword	0xb9aa6b78c1d21300, 0x1407c6d56c7fbead
470	.dword	0x6404462679195b3b, 0xe383c1a1fe9edcbc
471	.dword	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
472END_DATA .Lsbox_magic
473
474.macro	sm4_setkey
475	ld1	{v5.4s}, [x1]
476	load_sbox_matrix
477	rev32	v5.16b, v5.16b
478	adr	x5, .Lfk
479	ld1	{v6.4s}, [x5]
480	eor	v5.16b, v5.16b, v6.16b
481	mov	x6, #32
482	adr	x5, .Lck
4831:
484	mov	w7, v5.s[1]
485	ldr	w8, [x5], #4
486	eor	w8, w8, w7
487	mov	w7, v5.s[2]
488	eor	w8, w8, w7
489	mov	w7, v5.s[3]
490	eor	w8, w8, w7
491
492	/* optimize sbox using AESE instruction */
493	mov	TMP0.s[0], w8
494	sbox	TMP1, TMP0, TMP2, TMP3
495	mov	w7, TMP1.s[0]
496
497	/* linear transformation */
498	eor	w8, w7, w7, ror #19
499	eor	w8, w8, w7, ror #9
500	mov	w7, v5.s[0]
501	eor	w8, w8, w7
502	mov	v5.s[0], w8
503	ext	v5.16b, v5.16b, v5.16b, 4
504	subs	x6, x6, #1
505.endm
506
507/*
508 * void neon_sm4_setkey_enc(uint32_t sk[32], uint8_t const key[16]);
509 * x0: round key
510 * x1: user key
511 */
512FUNC neon_sm4_setkey_enc , :
513	sm4_setkey
514	str	w8, [x0], #4
515	b.ne	1b
516	ret
517END_FUNC neon_sm4_setkey_enc
518
519/*
520 * void neon_sm4_setkey_dec(uint32_t sk[32], uint8_t const key[16]);
521 * x0: round key
522 * x1: user key
523 */
524FUNC neon_sm4_setkey_dec , :
525	add	x0, x0, 124
526	sm4_setkey
527	str	w8, [x0], #-4
528	b.ne	1b
529	ret
530END_FUNC neon_sm4_setkey_dec
531
532/*
533 * void neon_sm4_ecb_encrypt(uint8_t out[], uint8_t const in[],
534 *			     uint8_t const rk[], size_t len);
535 * x0: output
536 * x1: input
537 * x2: round key
538 * w3: length
539 */
540FUNC neon_sm4_ecb_encrypt , :
541	frame_push
542	load_sbox_matrix
543	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
544	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
545	lsr	w3, w3, 4
546
547.Lecbloop8x:
548	cmp	w3, 8
549	b.lt	.Lecb4x
550	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
551	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
552	bl	sm4_encrypt_block8x
553	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
554	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
555	subs	w3, w3, #8
556	b.gt	.Lecbloop8x
557
558.Lecb4x:
559	cmp	w3, 1
560	b.lt	.Lecbout
561	cmp	w3, 2
562	b.lt	.Lecb1x
563	cmp	w3, 3
564	b.lt	.Lecb2x
565	cmp	w3, 4
566	b.lt	.Lecb3x
567	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
568	bl	sm4_encrypt_block4x
569	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
570	sub	w3, w3, #4
571	b	.Lecb4x
572
573.Lecb3x:
574	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
575	bl	sm4_encrypt_block4x
576	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
577	subs	w3, w3, #3
578	b.le	.Lecbout
579
580.Lecb2x:
581	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
582	bl	sm4_encrypt_block4x
583	st1	{BLK0.16b, BLK1.16b}, [x0], #32
584	subs	w3, w3, #2
585	b.le	.Lecbout
586
587.Lecb1x:
588	ld1	{BLK0.16b}, [x1], #16
589	bl	sm4_encrypt_block1x
590	st1	{BLK0.16b}, [x0], #16
591
592.Lecbout:
593	frame_pop
594	ret
595
596END_FUNC neon_sm4_ecb_encrypt
597
598/*
599 * void neon_sm4_cbc_encrypt(uint8_t out[], uint8_t const in[],
600 *			     uint8_t const rk[], size_t len,
601 *			     uint8_t iv[]);
602 * x0: output
603 * x1: input
604 * x2: round key
605 * w3: length
606 * x4: iv
607 */
608FUNC neon_sm4_cbc_encrypt , :
609	frame_push
610	load_sbox_matrix
611
612	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
613	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
614	lsr	w3, w3, 4
615	ld1	{IV.16b}, [x4]
616
617.Lcbcencloop4x:
618	cmp	w3, 4
619	b.lt	.Lcbcenc1x
620	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
621	eor	BLK0.16b, BLK0.16b, IV.16b
622	rev32	BLK0.16b, BLK0.16b
623	rev32	BLK1.16b, BLK1.16b
624	rev32	BLK2.16b, BLK2.16b
625	rev32	BLK3.16b, BLK3.16b
626	encrypt_block_no_rev	BLK0
627	eor	BLK1.16b, BLK1.16b, BLK0.16b
628	encrypt_block_no_rev	BLK1
629	rev32	BLK0.16b, BLK0.16b
630	eor	BLK2.16b, BLK2.16b, BLK1.16b
631	encrypt_block_no_rev	BLK2
632	rev32	BLK1.16b, BLK1.16b
633	eor	BLK3.16b, BLK3.16b, BLK2.16b
634	encrypt_block_no_rev	BLK3
635	rev32	BLK2.16b, BLK2.16b
636	rev32	BLK3.16b, BLK3.16b
637	mov	IV.16b, BLK3.16b
638	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
639	subs	w3, w3, #4
640	b	.Lcbcencloop4x
641.Lcbcenc1x:
642	cmp	w3, 1
643	b.lt	.Lcbcencout
644.Lcbcencloop:
645	ld1	{BLK0.16b}, [x1], #16
646	eor	BLK0.16b, BLK0.16b, IV.16b
647	bl	sm4_encrypt_block1x
648	mov	IV.16b, BLK0.16b
649	st1	{BLK0.16b}, [x0], #16
650	subs	w3, w3, #1
651	bne	.Lcbcencloop
652.Lcbcencout:
653	st1	{IV.16b}, [x4]
654	frame_pop
655	ret
656END_FUNC neon_sm4_cbc_encrypt
657
658/*
659 * void neon_sm4_cbc_decrypt(uint8_t out[], uint8_t const in[],
660 *			     uint8_t const rk[], size_t len,
661 *			     uint8_t iv[]);
662 * x0: output
663 * x1: input
664 * x2: round key
665 * w3: length
666 * x4: iv
667 */
668FUNC neon_sm4_cbc_decrypt , :
669	frame_push
670	load_sbox_matrix
671
672	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
673	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
674	lsr	w3, w3, 4
675	ld1	{IV.16b}, [x4]
676
677.Lcbcdecloop8x:
678	cmp	w3, 8
679	b.lt	.Lcbcdec4x
680
681	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
682	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
683	bl	sm4_encrypt_block8x
684	sub	x5, x1, #128
685	eor	BLK0.16b, BLK0.16b, IV.16b
686	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x5], #64
687	eor	BLK1.16b, BLK1.16b, TMP0.16b
688	eor	BLK2.16b, BLK2.16b, TMP1.16b
689	eor	BLK3.16b, BLK3.16b, TMP2.16b
690	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
691	ld1	{TMP4.16b, TMP5.16b, TMP6.16b, TMP7.16b}, [x5], #64
692	eor	BLK4.16b, BLK4.16b, TMP3.16b
693	eor	BLK5.16b, BLK5.16b, TMP4.16b
694	mov	IV.16b, TMP7.16b
695	eor	BLK6.16b, BLK6.16b, TMP5.16b
696	eor	BLK7.16b, BLK7.16b, TMP6.16b
697	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
698	subs	w3, w3, #8
699	b.gt	.Lcbcdecloop8x
700
701.Lcbcdec4x:
702	cmp	w3, 1
703	b.lt	.Lcbcdecout
704	cmp	w3, 2
705	b.lt	.Lcbcdec1x
706	cmp	w3, 3
707	b.lt	.Lcbcdec2x
708	cmp	w3, 4
709	b.lt	.Lcbcdec3x
710	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
711	bl	sm4_encrypt_block4x
712	sub	x5, x1, 64
713	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x5], #64
714	eor	BLK0.16b, BLK0.16b, IV.16b
715	eor	BLK1.16b, BLK1.16b, TMP0.16b
716	eor	BLK2.16b, BLK2.16b, TMP1.16b
717	eor	BLK3.16b, BLK3.16b, TMP2.16b
718	mov	IV.16b, TMP3.16b
719	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
720	sub	w3, w3, #4
721	b	.Lcbcdec4x
722
723.Lcbcdec3x:
724	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
725	bl	sm4_encrypt_block4x
726	sub	x5, x1, 48
727	ld1	{TMP0.16b, TMP1.16b, TMP2.16b}, [x5], #48
728	eor	BLK0.16b, BLK0.16b, IV.16b
729	eor	BLK1.16b, BLK1.16b, TMP0.16b
730	eor	BLK2.16b, BLK2.16b, TMP1.16b
731	mov	IV.16b, TMP2.16b
732	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
733	subs	w3, w3, #3
734	b.le	.Lcbcdecout
735
736.Lcbcdec2x:
737	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
738	bl	sm4_encrypt_block4x
739	sub	x5, x1, 32
740	ld1	{TMP0.16b, TMP1.16b}, [x5], #32
741	eor	BLK0.16b, BLK0.16b, IV.16b
742	eor	BLK1.16b, BLK1.16b, TMP0.16b
743	mov	IV.16b, TMP1.16b
744	st1	{BLK0.16b, BLK1.16b}, [x0], #32
745	subs	w3, w3, #2
746	b.le	.Lcbcdecout
747
748.Lcbcdec1x:
749	ld1	{BLK0.16b}, [x1], #16
750	bl	sm4_encrypt_block1x
751	sub	x5, x1, 16
752	ld1	{TMP0.16b}, [x5], #16
753	eor	BLK0.16b, BLK0.16b, IV.16b
754	mov	IV.16b, TMP0.16b
755	st1	{BLK0.16b}, [x0], #16
756
757.Lcbcdecout:
758	st1	{IV.16b}, [x4]
759	frame_pop
760	ret
761END_FUNC neon_sm4_cbc_decrypt
762
763/*
764 * void neon_sm4_ctr_encrypt(uint8_t out[], uint8_t const in[],
765 *			     uint8_t const rk[], size_t len,
766 *			     uint8_t iv[]);
767 * x0: output
768 * x1: input
769 * x2: round key
770 * w3: length
771 * x4: iv
772 */
773FUNC neon_sm4_ctr_encrypt , :
774	frame_push
775	load_sbox_matrix
776
777	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
778	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
779	lsr	w3, w3, 4
780	ldp	x7, x8, [x4]
781	rev	x7, x7
782	rev	x8, x8
783
784.Lctrloop8x:
785	cmp	w3, 8
786	b.lt	.Lctr4x
787
788	/* construct CTRs */
789	inc_le128	BLK0, x7, x8
790	inc_le128	BLK1, x7, x8
791	inc_le128	BLK2, x7, x8
792	inc_le128	BLK3, x7, x8
793	inc_le128	BLK4, x7, x8
794	inc_le128	BLK5, x7, x8
795	inc_le128	BLK6, x7, x8
796	inc_le128	BLK7, x7, x8
797	bl	sm4_encrypt_block8x
798	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x1], #64
799	ld1	{TMP4.16b, TMP5.16b, TMP6.16b, TMP7.16b}, [x1], #64
800	eor	BLK0.16b, BLK0.16b, TMP0.16b
801	eor	BLK1.16b, BLK1.16b, TMP1.16b
802	eor	BLK2.16b, BLK2.16b, TMP2.16b
803	eor	BLK3.16b, BLK3.16b, TMP3.16b
804	eor	BLK4.16b, BLK4.16b, TMP4.16b
805	eor	BLK5.16b, BLK5.16b, TMP5.16b
806	eor	BLK6.16b, BLK6.16b, TMP6.16b
807	eor	BLK7.16b, BLK7.16b, TMP7.16b
808	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
809	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
810	subs	w3, w3, #8
811	b.gt	.Lctrloop8x
812
813.Lctr4x:
814	cmp	w3, 1
815	b.lt	.Lctrout
816	cmp	w3, 2
817	b.lt	.Lctr1x
818	cmp	w3, 3
819	b.lt	.Lctr2x
820	cmp	w3, 4
821	b.lt	.Lctr3x
822	inc_le128	BLK0, x7, x8
823	inc_le128	BLK1, x7, x8
824	inc_le128	BLK2, x7, x8
825	inc_le128	BLK3, x7, x8
826	bl	sm4_encrypt_block4x
827	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x1], #64
828	eor	BLK0.16b, BLK0.16b, TMP0.16b
829	eor	BLK1.16b, BLK1.16b, TMP1.16b
830	eor	BLK2.16b, BLK2.16b, TMP2.16b
831	eor	BLK3.16b, BLK3.16b, TMP3.16b
832	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
833	sub	w3, w3, #4
834	b	.Lctr4x
835
836.Lctr3x:
837	inc_le128	BLK0, x7, x8
838	inc_le128	BLK1, x7, x8
839	inc_le128	BLK2, x7, x8
840	bl	sm4_encrypt_block4x
841	ld1	{TMP0.16b, TMP1.16b, TMP2.16b}, [x1], #48
842	eor	BLK0.16b, BLK0.16b, TMP0.16b
843	eor	BLK1.16b, BLK1.16b, TMP1.16b
844	eor	BLK2.16b, BLK2.16b, TMP2.16b
845	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
846	subs	w3, w3, #3
847	b.le	.Lctrout
848
849.Lctr2x:
850	inc_le128	BLK0, x7, x8
851	inc_le128	BLK1, x7, x8
852	bl	sm4_encrypt_block4x
853	ld1	{TMP0.16b, TMP1.16b}, [x1], #32
854	eor	BLK0.16b, BLK0.16b, TMP0.16b
855	eor	BLK1.16b, BLK1.16b, TMP1.16b
856	st1	{BLK0.16b, BLK1.16b}, [x0], #32
857	subs	w3, w3, #2
858	b.le	.Lctrout
859
860.Lctr1x:
861	inc_le128	BLK0, x7, x8
862	bl	sm4_encrypt_block1x
863	ld1	{TMP0.16b}, [x1], #16
864	eor	BLK0.16b, BLK0.16b, TMP0.16b
865	st1	{BLK0.16b}, [x0], #16
866
867.Lctrout:
868	rev	x7, x7
869	rev	x8, x8
870	stp	x7, x8, [x4]
871	frame_pop
872	ret
873END_FUNC neon_sm4_ctr_encrypt
874
875/*
876 * x0: output
877 * x1: input
878 * x2: round key1
879 * x3: round key2
880 * w4: blocks
881 * x26: enc/dec
882 */
883LOCAL_FUNC xts_do_cipher , :
884	stp	x29, x30, [sp, #-16]!
885	mov	x29, sp
886	load_sbox_matrix
887	ld1	{IV.16b}, [x5]
888	/* load round key2 for first tweak */
889	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x3], #64
890	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x3], #64
891	encrypt_block	IV
892	/* load round key1 for block cipher */
893	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
894	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
895	/* w6: remain */
896	and	w6, w4, #0x0F
897	/* w4: blocks */
898	lsr	w4, w4, 4
899	/* blocks == 0: ret */
900	cmp	w4, #1
901	b.lt	.Lxtsout
902	cmp	w6, 0
903	b.eq	.Lxtsblks
904	subs	w4, w4, #1
905	b.eq	.Lxtstail
906.Lxtsblks:
907	mov	tw0l, IV.d[0]
908	mov	tw0h, IV.d[1]
909	next_tweak	tw1l, tw1h, tw0l, tw0h
910	next_tweak	tw2l, tw2h, tw1l, tw1h
911	next_tweak	tw3l, tw3h, tw2l, tw2h
912	next_tweak	tw4l, tw4h, tw3l, tw3h
913	next_tweak	tw5l, tw5h, tw4l, tw4h
914	next_tweak	tw6l, tw6h, tw5l, tw5h
915	next_tweak	tw7l, tw7h, tw6l, tw6h
916.Lxtsloop8x:
917	cmp	w4, 8
918	b.lt	.Lxts4x
919	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
920	mov_reg_to_vec	TMP0, tw0l, tw0h
921	mov_reg_to_vec	TMP1, tw1l, tw1h
922	mov_reg_to_vec	TMP2, tw2l, tw2h
923	mov_reg_to_vec	TMP3, tw3l, tw3h
924	eor BLK0.16b, BLK0.16b, TMP0.16b
925	eor BLK1.16b, BLK1.16b, TMP1.16b
926	eor BLK2.16b, BLK2.16b, TMP2.16b
927	eor BLK3.16b, BLK3.16b, TMP3.16b
928	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
929	mov_reg_to_vec	TMP4, tw4l, tw4h
930	mov_reg_to_vec	TMP5, tw5l, tw5h
931	mov_reg_to_vec	TMP6, tw6l, tw6h
932	mov_reg_to_vec	IV, tw7l, tw7h
933	eor BLK4.16b, BLK4.16b, TMP4.16b
934	eor BLK5.16b, BLK5.16b, TMP5.16b
935	eor BLK6.16b, BLK6.16b, TMP6.16b
936	eor BLK7.16b, BLK7.16b, IV.16b
937
938	bl	sm4_encrypt_block8x
939
940	mov_reg_to_vec	TMP0, tw0l, tw0h
941	next_tweak	tw0l, tw0h, tw7l, tw7h
942	mov_reg_to_vec	TMP1, tw1l, tw1h
943	next_tweak	tw1l, tw1h, tw0l, tw0h
944	mov_reg_to_vec	TMP2, tw2l, tw2h
945	next_tweak	tw2l, tw2h, tw1l, tw1h
946	mov_reg_to_vec	TMP3, tw3l, tw3h
947	next_tweak	tw3l, tw3h, tw2l, tw2h
948	mov_reg_to_vec	TMP4, tw4l, tw4h
949	next_tweak	tw4l, tw4h, tw3l, tw3h
950	mov_reg_to_vec	TMP5, tw5l, tw5h
951	next_tweak	tw5l, tw5h, tw4l, tw4h
952	mov_reg_to_vec	TMP6, tw6l, tw6h
953	next_tweak	tw6l, tw6h, tw5l, tw5h
954	mov_reg_to_vec	IV, tw7l, tw7h
955	next_tweak	tw7l, tw7h, tw6l, tw6h
956
957	eor BLK0.16b, BLK0.16b, TMP0.16b
958	eor BLK1.16b, BLK1.16b, TMP1.16b
959	eor BLK2.16b, BLK2.16b, TMP2.16b
960	eor BLK3.16b, BLK3.16b, TMP3.16b
961	eor BLK4.16b, BLK4.16b, TMP4.16b
962	eor BLK5.16b, BLK5.16b, TMP5.16b
963	eor BLK6.16b, BLK6.16b, TMP6.16b
964	eor BLK7.16b, BLK7.16b, IV.16b
965
966	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
967	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
968	subs	w4, w4, #8
969	b.gt	.Lxtsloop8x
970
971.Lxts4x:
972	cmp	w4, 1
973	b.lt	.Lxtsblksout
974	cmp	w4, 2
975	b.lt	.Lxts1x
976	cmp	w4, 3
977	b.lt	.Lxts2x
978	cmp	w4, 4
979	b.lt	.Lxts3x
980	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
981	mov_reg_to_vec	BLK4, tw0l, tw0h
982	mov_reg_to_vec	BLK5, tw1l, tw1h
983	mov_reg_to_vec	BLK6, tw2l, tw2h
984	mov_reg_to_vec	IV, tw3l, tw3h
985	eor	BLK0.16b, BLK0.16b, BLK4.16b
986	eor	BLK1.16b, BLK1.16b, BLK5.16b
987	eor	BLK2.16b, BLK2.16b, BLK6.16b
988	eor	BLK3.16b, BLK3.16b, IV.16b
989	bl	sm4_encrypt_block4x
990	eor	BLK0.16b, BLK0.16b, BLK4.16b
991	eor	BLK1.16b, BLK1.16b, BLK5.16b
992	eor	BLK2.16b, BLK2.16b, BLK6.16b
993	eor	BLK3.16b, BLK3.16b, IV.16b
994	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
995	sub	w4, w4, #4
996
997	mov	tw0l, tw4l
998	mov	tw0h, tw4h
999	mov	tw1l, tw5l
1000	mov	tw1h, tw5h
1001	mov	tw2l, tw6l
1002	mov	tw2h, tw6h
1003	b	.Lxts4x
1004
1005.Lxts3x:
1006	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
1007	mov_reg_to_vec	BLK4, tw0l, tw0h
1008	mov_reg_to_vec	BLK5, tw1l, tw1h
1009	mov_reg_to_vec	IV, tw2l, tw2h
1010	eor	BLK0.16b, BLK0.16b, BLK4.16b
1011	eor	BLK1.16b, BLK1.16b, BLK5.16b
1012	eor	BLK2.16b, BLK2.16b, IV.16b
1013	bl	sm4_encrypt_block4x
1014	eor	BLK0.16b, BLK0.16b, BLK4.16b
1015	eor	BLK1.16b, BLK1.16b, BLK5.16b
1016	eor	BLK2.16b, BLK2.16b, IV.16b
1017	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
1018	subs	w4, w4, #3
1019	b.le	.Lxtsblksout
1020
1021.Lxts2x:
1022	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
1023	mov_reg_to_vec	BLK4, tw0l, tw0h
1024	mov_reg_to_vec	IV, tw1l, tw1h
1025	eor	BLK0.16b, BLK0.16b, BLK4.16b
1026	eor	BLK1.16b, BLK1.16b, IV.16b
1027	bl	sm4_encrypt_block4x
1028	eor	BLK0.16b, BLK0.16b, BLK4.16b
1029	eor	BLK1.16b, BLK1.16b, IV.16b
1030	st1	{BLK0.16b, BLK1.16b}, [x0], #32
1031	subs	w4, w4, #2
1032	b.le	.Lxtsblksout
1033
1034.Lxts1x:
1035	ld1	{BLK0.16b}, [x1], #16
1036	mov_reg_to_vec	IV, tw0l, tw0h
1037	eor	BLK0.16b, BLK0.16b, IV.16b
1038	bl	sm4_encrypt_block1x
1039	eor	BLK0.16b, BLK0.16b, IV.16b
1040	st1	{BLK0.16b}, [x0], #16
1041.Lxtsblksout:
1042	cmp	w6, 0
1043	/* if encrypt some blocks with a partial block */
1044	next_tweak_vec	IV, IV
1045	b.eq	.Lxtsout
1046.Lxtstail:
1047	next_tweak_vec	TMP7, IV
1048	cmp	x26, 1
1049	b.eq	1f
1050	/* The last two tweaks IV, TMP7 need to be swapped for decryption */
1051	mov	TMP8.16b, IV.16b
1052	mov	IV.16b, TMP7.16b
1053	mov	TMP7.16b, TMP8.16b
1054	1:
1055	ld1	{BLK0.16b}, [x1], #16
1056	eor	BLK0.16b, BLK0.16b, IV.16b
1057	bl	sm4_encrypt_block1x
1058	eor	BLK0.16b, BLK0.16b, IV.16b
1059	st1	{BLK0.16b}, [x0], #16
1060	sub	x7, x0, 16
1061	10:
1062	subs	x6, x6, 1
1063	ldrb	tmpw0, [x7, x6]
1064	strb	tmpw0, [x0, x6]
1065	ldrb	tmpw0, [x1, x6]
1066	strb	tmpw0, [x7, x6]
1067	b.gt	10b
1068	ld1	{BLK0.16b}, [x7]
1069	eor	BLK0.16b, BLK0.16b, TMP7.16b
1070	bl	sm4_encrypt_block1x
1071	eor	BLK0.16b, BLK0.16b, TMP7.16b
1072	st1	{BLK0.16b}, [x7]
1073
1074.Lxtsout:
1075	/* load round key2 for last tweak */
1076	sub	x3, x3, #128
1077	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x3], #64
1078	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x3], #64
1079	/* decrypt last tweak for next update */
1080	decrypt_block	IV
1081	st1	{IV.16b}, [x5]
1082	ldp x29, x30, [sp], #16
1083	ret
1084END_FUNC xts_do_cipher
1085
1086/*
1087 * void neon_sm4_xts_encrypt(uint8_t out[], uint8_t const in[],
1088 * 			     uint8_t const rk1[], uint8_t const rk2[],
1089 * 			     size_t len, uint8_t iv[])
1090 * x0: output
1091 * x1: input
1092 * x2: round key1
1093 * x3: round key2
1094 * w4: len
1095 * x5: iv
1096 */
1097FUNC neon_sm4_xts_encrypt , :
1098	frame_push
1099	mov	x26, 1
1100	bl	xts_do_cipher
1101	frame_pop
1102	ret
1103END_FUNC neon_sm4_xts_encrypt
1104
1105/*
1106 * void neon_sm4_xts_decrypt(uint8_t out[], uint8_t const in[],
1107 * 			     uint8_t const rk1[], uint8_t const rk2[],
1108 * 			     size_t len, uint8_t iv[])
1109 * x0: output
1110 * x1: input
1111 * x2: round key1
1112 * x3: round key2
1113 * w4: len
1114 * x5: iv
1115 */
1116FUNC neon_sm4_xts_decrypt , :
1117	frame_push
1118	mov	x26, 0
1119	bl	xts_do_cipher
1120	frame_pop
1121	ret
1122END_FUNC neon_sm4_xts_decrypt
1123