xref: /optee_os/core/arch/arm/crypto/sha3_armv8a_ce_a64.S (revision 4c39d4d10e4b797bec83fb18e4b7c39454188759)
1/* SPDX-License-Identifier: BSD-2-Clause */
2/*
3 * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
4 * Copyright (c) 2023 Linaro Limited
5 */
6
7/* Core SHA-3 transform using v8 Crypto Extensions */
8
9#include <asm.S>
10#include <arm64_macros.S>
11
12	.irp	b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,\
13		21,22,23,24,25,26,27,28,29,30,31
14	.set	.Lv\b\().2d, \b
15	.set	.Lv\b\().16b, \b
16	.endr
17
18	/*
19	 * ARMv8.2 Crypto Extensions instructions
20	 */
21	.macro	eor3, rd, rn, rm, ra
22	.inst	0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | \
23		(.L\rm << 16)
24	.endm
25
26	.macro	rax1, rd, rn, rm
27	.inst	0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
28	.endm
29
30	.macro	bcax, rd, rn, rm, ra
31	.inst	0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | \
32		(.L\rm << 16)
33	.endm
34
35	.macro	xar, rd, rn, rm, imm6
36	.inst	0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | \
37		(.L\rm << 16)
38	.endm
39
40	/*
41	 * int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
42	 *			 int dg_size)
43	 */
44	.text
FUNC sha3_ce_transform , :
	/*
	 * Absorb 'blocks' consecutive input blocks into the 25-lane state,
	 * running the 24-round permutation after each block.
	 *
	 * In:	x0 = st		25 x 64-bit state lanes, held in v0-v24
	 *	x1 = data	input, advanced by one block per iteration
	 *	w2 = blocks	number of blocks to absorb
	 *	w3 = dg_size	selects the block size by its bit pattern:
	 *			  64 ->  72 bytes	48 -> 104 bytes
	 *			  32 -> 136 bytes	28 -> 144 bytes
	 *			  16 -> 168 bytes
	 * Out:	w0 = value of the block counter at exit, i.e. 0
	 * Clobbers: x0, x8, x9, v0-v31
	 *
	 * blocks == 0 returns immediately without touching the state.
	 * The counter is only ever tested for zero, so a negative count is
	 * not detected.
	 *
	 * NOTE(review): v8-v15 are overwritten and not restored although
	 * AAPCS64 makes their low 64 bits callee-saved; this presumably
	 * relies on the caller having saved the FP/SIMD state - confirm
	 * against the call site.
	 */
	cbz	w2, 5f

	/* load state */
	add	x8, x0, #32
	ld1	{ v0.1d- v3.1d}, [x0]
	ld1	{ v4.1d- v7.1d}, [x8], #32
	ld1	{ v8.1d-v11.1d}, [x8], #32
	ld1	{v12.1d-v15.1d}, [x8], #32
	ld1	{v16.1d-v19.1d}, [x8], #32
	ld1	{v20.1d-v23.1d}, [x8], #32
	ld1	{v24.1d}, [x8]

	/*
	 * Per-block loop: w2 = blocks left after this one, w8 = rounds
	 * left, x9 = pointer to the next round constant.
	 */
0:	sub	w2, w2, #1
	mov	w8, #24
	adr_l	x9, .Lsha3_rcon

	/* load input: the first 56 bytes are common to all block sizes */
	ld1	{v25.8b-v28.8b}, [x1], #32
	ld1	{v29.8b-v31.8b}, [x1], #24
	eor	v0.8b, v0.8b, v25.8b
	eor	v1.8b, v1.8b, v26.8b
	eor	v2.8b, v2.8b, v27.8b
	eor	v3.8b, v3.8b, v28.8b
	eor	v4.8b, v4.8b, v29.8b
	eor	v5.8b, v5.8b, v30.8b
	eor	v6.8b, v6.8b, v31.8b

	/* Bit 6 set? -> SHA3-512 */
	tbnz	x3, #6, 3f

	/*
	 * SHA3-384, SHA3-256, SHA3-224 or SHA3-128: next 48 bytes,
	 * 104 bytes absorbed so far
	 */
	ld1	{v25.8b-v28.8b}, [x1], #32
	ld1	{v29.8b-v30.8b}, [x1], #16
	eor	 v7.8b,  v7.8b, v25.8b
	eor	 v8.8b,  v8.8b, v26.8b
	eor	 v9.8b,  v9.8b, v27.8b
	eor	v10.8b, v10.8b, v28.8b
	eor	v11.8b, v11.8b, v29.8b
	eor	v12.8b, v12.8b, v30.8b

	/* bit 4 set? -> SHA3-384, SHA3-224 or SHA3-128 */
	tbnz	x3, #4, 1f

	/* SHA3-256: digest size 32 bytes, block size 136 bytes */
	ld1	{v25.8b-v28.8b}, [x1], #32
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	b	4f

	/*
	 * bit 5 set? -> SHA3-384: digest size 48 bytes, block size
	 * 104 bytes, all of which has been absorbed already
	 */
1:	tbnz	x3, #5, 4f

	/* SHA3-224 or SHA3-128: next 32 bytes, 136 bytes absorbed so far */
	ld1	{v25.8b-v28.8b}, [x1], #32
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b

	/* bit 2 set? -> SHA3-224 */
	tbnz	x3, #2, 2f

	/*
	 * SHA3-128: digest size 16 bytes, block size 168 bytes
	 * (168 bytes is the rate FIPS 202 specifies for SHAKE128)
	 */
	ld1	{v25.8b-v28.8b}, [x1], #32
	eor	v17.8b, v17.8b, v25.8b
	eor	v18.8b, v18.8b, v26.8b
	eor	v19.8b, v19.8b, v27.8b
	eor	v20.8b, v20.8b, v28.8b
	b	4f

	/* SHA3-224: digest size 28 bytes, block size 144 bytes */
2:	ld1	{v29.8b}, [x1], #8
	eor	v17.8b, v17.8b, v29.8b
	b	4f

	/* SHA3-512: digest size 64 bytes, block size 72 bytes */
3:	ld1	{v25.8b-v26.8b}, [x1], #16
	eor	 v7.8b,  v7.8b, v25.8b
	eor	 v8.8b,  v8.8b, v26.8b

	/* One round of the permutation per pass through this loop */
4:	sub	w8, w8, #1

	/*
	 * theta, step 1: column parities. v25..v29 become the XOR of the
	 * five lanes in columns 0..4 (e.g. v25 = v0^v5^v10^v15^v20).
	 */
	eor3	v29.16b,  v4.16b,  v9.16b, v14.16b
	eor3	v26.16b,  v1.16b,  v6.16b, v11.16b
	eor3	v28.16b,  v3.16b,  v8.16b, v13.16b
	eor3	v25.16b,  v0.16b,  v5.16b, v10.16b
	eor3	v27.16b,  v2.16b,  v7.16b, v12.16b
	eor3	v29.16b, v29.16b, v19.16b, v24.16b
	eor3	v26.16b, v26.16b, v16.16b, v21.16b
	eor3	v28.16b, v28.16b, v18.16b, v23.16b
	eor3	v25.16b, v25.16b, v15.16b, v20.16b
	eor3	v27.16b, v27.16b, v17.16b, v22.16b

	/*
	 * theta, step 2: combine each parity with its neighbour rotated
	 * left by one. The order matters: each source is read before the
	 * instruction that overwrites it.
	 */
	rax1	v30.2d, v29.2d, v26.2d	// bc[0]
	rax1	v26.2d, v26.2d, v28.2d	// bc[2]
	rax1	v28.2d, v28.2d, v25.2d	// bc[4]
	rax1	v25.2d, v25.2d, v27.2d	// bc[1]
	rax1	v27.2d, v27.2d, v29.2d	// bc[3]

	/*
	 * theta (apply), rho and pi fused: each xar XORs a lane with its
	 * bc[] value, rotates it (ror by 64 - n == rol by n) and writes
	 * it to its new position. Lane 0 is neither rotated nor moved.
	 * v25-v31 double as temporaries, so the sequence order must be
	 * kept.
	 */
	eor	 v0.16b,  v0.16b, v30.16b
	xar	 v29.2d,   v1.2d,  v25.2d, (64 - 1)
	xar	  v1.2d,   v6.2d,  v25.2d, (64 - 44)
	xar	  v6.2d,   v9.2d,  v28.2d, (64 - 20)
	xar	  v9.2d,  v22.2d,  v26.2d, (64 - 61)
	xar	 v22.2d,  v14.2d,  v28.2d, (64 - 39)
	xar	 v14.2d,  v20.2d,  v30.2d, (64 - 18)
	xar	 v31.2d,   v2.2d,  v26.2d, (64 - 62)
	xar	  v2.2d,  v12.2d,  v26.2d, (64 - 43)
	xar	 v12.2d,  v13.2d,  v27.2d, (64 - 25)
	xar	 v13.2d,  v19.2d,  v28.2d, (64 - 8)
	xar	 v19.2d,  v23.2d,  v27.2d, (64 - 56)
	xar	 v23.2d,  v15.2d,  v30.2d, (64 - 41)
	xar	 v15.2d,   v4.2d,  v28.2d, (64 - 27)
	xar	 v28.2d,  v24.2d,  v28.2d, (64 - 14)
	xar	 v24.2d,  v21.2d,  v25.2d, (64 - 2)
	xar	  v8.2d,   v8.2d,  v27.2d, (64 - 55)
	xar	  v4.2d,  v16.2d,  v25.2d, (64 - 45)
	xar	 v16.2d,   v5.2d,  v30.2d, (64 - 36)
	xar	  v5.2d,   v3.2d,  v27.2d, (64 - 28)
	xar	 v27.2d,  v18.2d,  v27.2d, (64 - 21)
	xar	  v3.2d,  v17.2d,  v26.2d, (64 - 15)
	xar	 v25.2d,  v11.2d,  v25.2d, (64 - 10)
	xar	 v26.2d,   v7.2d,  v26.2d, (64 - 6)
	xar	 v30.2d,  v10.2d,  v30.2d, (64 - 3)

	/* chi, one row of five lanes per group: a ^= ~b & c */
	bcax	v20.16b, v31.16b, v22.16b,  v8.16b
	bcax	v21.16b,  v8.16b, v23.16b, v22.16b
	bcax	v22.16b, v22.16b, v24.16b, v23.16b
	bcax	v23.16b, v23.16b, v31.16b, v24.16b
	bcax	v24.16b, v24.16b,  v8.16b, v31.16b

	/* v31 is free from here on: fetch this round's constant */
	ld1r	{v31.2d}, [x9], #8

	bcax	v17.16b, v25.16b, v19.16b,  v3.16b
	bcax	v18.16b,  v3.16b, v15.16b, v19.16b
	bcax	v19.16b, v19.16b, v16.16b, v15.16b
	bcax	v15.16b, v15.16b, v25.16b, v16.16b
	bcax	v16.16b, v16.16b,  v3.16b, v25.16b

	bcax	v10.16b, v29.16b, v12.16b, v26.16b
	bcax	v11.16b, v26.16b, v13.16b, v12.16b
	bcax	v12.16b, v12.16b, v14.16b, v13.16b
	bcax	v13.16b, v13.16b, v29.16b, v14.16b
	bcax	v14.16b, v14.16b, v26.16b, v29.16b

	bcax	 v7.16b, v30.16b,  v9.16b,  v4.16b
	bcax	 v8.16b,  v4.16b,  v5.16b,  v9.16b
	bcax	 v9.16b,  v9.16b,  v6.16b,  v5.16b
	bcax	 v5.16b,  v5.16b, v30.16b,  v6.16b
	bcax	 v6.16b,  v6.16b,  v4.16b, v30.16b

	bcax	 v3.16b, v27.16b,  v0.16b, v28.16b
	bcax	 v4.16b, v28.16b,  v1.16b,  v0.16b
	bcax	 v0.16b,  v0.16b,  v2.16b,  v1.16b
	bcax	 v1.16b,  v1.16b, v27.16b,  v2.16b
	bcax	 v2.16b,  v2.16b, v28.16b, v27.16b

	/* iota: fold the round constant into lane 0 */
	eor	 v0.16b,  v0.16b, v31.16b

	cbnz	w8, 4b		/* more rounds for this block */
	cbnz	w2, 0b		/* more blocks to absorb */

	/* save state */
	st1	{ v0.1d- v3.1d}, [x0], #32
	st1	{ v4.1d- v7.1d}, [x0], #32
	st1	{ v8.1d-v11.1d}, [x0], #32
	st1	{v12.1d-v15.1d}, [x0], #32
	st1	{v16.1d-v19.1d}, [x0], #32
	st1	{v20.1d-v23.1d}, [x0], #32
	st1	{v24.1d}, [x0]

	/* w2 is 0 on both paths that reach this point */
5:	mov	w0, w2
	ret
END_FUNC sha3_ce_transform
219
220	.section	".rodata", "a"
221	.align		8
222LOCAL_DATA .Lsha3_rcon , :
223	.quad	0x0000000000000001, 0x0000000000008082, 0x800000000000808a
224	.quad	0x8000000080008000, 0x000000000000808b, 0x0000000080000001
225	.quad	0x8000000080008081, 0x8000000000008009, 0x000000000000008a
226	.quad	0x0000000000000088, 0x0000000080008009, 0x000000008000000a
227	.quad	0x000000008000808b, 0x800000000000008b, 0x8000000000008089
228	.quad	0x8000000000008003, 0x8000000000008002, 0x8000000000000080
229	.quad	0x000000000000800a, 0x800000008000000a, 0x8000000080008081
230	.quad	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
231
/*
 * Tag this object with the AArch64 BTI feature property.
 * NOTE(review): BTI() and emit_aarch64_feature_1_and are defined outside
 * this file; presumably this expands to nothing when BTI support is not
 * configured - confirm against the included headers.
 */
BTI(emit_aarch64_feature_1_and GNU_PROPERTY_AARCH64_FEATURE_1_BTI)
233