xref: /OK3568_Linux_fs/kernel/arch/arm64/crypto/ghash-ce-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */
7*4882a593Smuzhiyun
8*4882a593Smuzhiyun#include <linux/linkage.h>
9*4882a593Smuzhiyun#include <asm/assembler.h>
10*4882a593Smuzhiyun
11*4882a593Smuzhiyun	SHASH		.req	v0
12*4882a593Smuzhiyun	SHASH2		.req	v1
13*4882a593Smuzhiyun	T1		.req	v2
14*4882a593Smuzhiyun	T2		.req	v3
15*4882a593Smuzhiyun	MASK		.req	v4
16*4882a593Smuzhiyun	XM		.req	v5
17*4882a593Smuzhiyun	XL		.req	v6
18*4882a593Smuzhiyun	XH		.req	v7
19*4882a593Smuzhiyun	IN1		.req	v7
20*4882a593Smuzhiyun
21*4882a593Smuzhiyun	k00_16		.req	v8
22*4882a593Smuzhiyun	k32_48		.req	v9
23*4882a593Smuzhiyun
24*4882a593Smuzhiyun	t3		.req	v10
25*4882a593Smuzhiyun	t4		.req	v11
26*4882a593Smuzhiyun	t5		.req	v12
27*4882a593Smuzhiyun	t6		.req	v13
28*4882a593Smuzhiyun	t7		.req	v14
29*4882a593Smuzhiyun	t8		.req	v15
30*4882a593Smuzhiyun	t9		.req	v16
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun	perm1		.req	v17
33*4882a593Smuzhiyun	perm2		.req	v18
34*4882a593Smuzhiyun	perm3		.req	v19
35*4882a593Smuzhiyun
36*4882a593Smuzhiyun	sh1		.req	v20
37*4882a593Smuzhiyun	sh2		.req	v21
38*4882a593Smuzhiyun	sh3		.req	v22
39*4882a593Smuzhiyun	sh4		.req	v23
40*4882a593Smuzhiyun
41*4882a593Smuzhiyun	ss1		.req	v24
42*4882a593Smuzhiyun	ss2		.req	v25
43*4882a593Smuzhiyun	ss3		.req	v26
44*4882a593Smuzhiyun	ss4		.req	v27
45*4882a593Smuzhiyun
46*4882a593Smuzhiyun	XL2		.req	v8
47*4882a593Smuzhiyun	XM2		.req	v9
48*4882a593Smuzhiyun	XH2		.req	v10
49*4882a593Smuzhiyun	XL3		.req	v11
50*4882a593Smuzhiyun	XM3		.req	v12
51*4882a593Smuzhiyun	XH3		.req	v13
52*4882a593Smuzhiyun	TT3		.req	v14
53*4882a593Smuzhiyun	TT4		.req	v15
54*4882a593Smuzhiyun	HH		.req	v16
55*4882a593Smuzhiyun	HH3		.req	v17
56*4882a593Smuzhiyun	HH4		.req	v18
57*4882a593Smuzhiyun	HH34		.req	v19
58*4882a593Smuzhiyun
59*4882a593Smuzhiyun	.text
60*4882a593Smuzhiyun	.arch		armv8-a+crypto
61*4882a593Smuzhiyun
62*4882a593Smuzhiyun	.macro		__pmull_p64, rd, rn, rm
63*4882a593Smuzhiyun	pmull		\rd\().1q, \rn\().1d, \rm\().1d
64*4882a593Smuzhiyun	.endm
65*4882a593Smuzhiyun
66*4882a593Smuzhiyun	.macro		__pmull2_p64, rd, rn, rm
67*4882a593Smuzhiyun	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
68*4882a593Smuzhiyun	.endm
69*4882a593Smuzhiyun
70*4882a593Smuzhiyun	.macro		__pmull_p8, rq, ad, bd
71*4882a593Smuzhiyun	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
72*4882a593Smuzhiyun	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
73*4882a593Smuzhiyun	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3
74*4882a593Smuzhiyun
75*4882a593Smuzhiyun	__pmull_p8_\bd	\rq, \ad
76*4882a593Smuzhiyun	.endm
77*4882a593Smuzhiyun
78*4882a593Smuzhiyun	.macro		__pmull2_p8, rq, ad, bd
79*4882a593Smuzhiyun	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
80*4882a593Smuzhiyun	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
81*4882a593Smuzhiyun	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3
82*4882a593Smuzhiyun
83*4882a593Smuzhiyun	__pmull2_p8_\bd	\rq, \ad
84*4882a593Smuzhiyun	.endm
85*4882a593Smuzhiyun
86*4882a593Smuzhiyun	.macro		__pmull_p8_SHASH, rq, ad
87*4882a593Smuzhiyun	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
88*4882a593Smuzhiyun	.endm
89*4882a593Smuzhiyun
90*4882a593Smuzhiyun	.macro		__pmull_p8_SHASH2, rq, ad
91*4882a593Smuzhiyun	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
92*4882a593Smuzhiyun	.endm
93*4882a593Smuzhiyun
94*4882a593Smuzhiyun	.macro		__pmull2_p8_SHASH, rq, ad
95*4882a593Smuzhiyun	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
96*4882a593Smuzhiyun	.endm
97*4882a593Smuzhiyun
98*4882a593Smuzhiyun	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
99*4882a593Smuzhiyun	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
100*4882a593Smuzhiyun	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
101*4882a593Smuzhiyun	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
102*4882a593Smuzhiyun	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
103*4882a593Smuzhiyun	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
104*4882a593Smuzhiyun	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
105*4882a593Smuzhiyun	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
106*4882a593Smuzhiyun	pmull\t		\rq\().8h, \ad, \bd			// D = A*B
107*4882a593Smuzhiyun
108*4882a593Smuzhiyun	eor		t3.16b, t3.16b, t4.16b			// L = E + F
109*4882a593Smuzhiyun	eor		t5.16b, t5.16b, t6.16b			// M = G + H
110*4882a593Smuzhiyun	eor		t7.16b, t7.16b, t8.16b			// N = I + J
111*4882a593Smuzhiyun
112*4882a593Smuzhiyun	uzp1		t4.2d, t3.2d, t5.2d
113*4882a593Smuzhiyun	uzp2		t3.2d, t3.2d, t5.2d
114*4882a593Smuzhiyun	uzp1		t6.2d, t7.2d, t9.2d
115*4882a593Smuzhiyun	uzp2		t7.2d, t7.2d, t9.2d
116*4882a593Smuzhiyun
117*4882a593Smuzhiyun	// t3 = (L) (P0 + P1) << 8
118*4882a593Smuzhiyun	// t5 = (M) (P2 + P3) << 16
119*4882a593Smuzhiyun	eor		t4.16b, t4.16b, t3.16b
120*4882a593Smuzhiyun	and		t3.16b, t3.16b, k32_48.16b
121*4882a593Smuzhiyun
122*4882a593Smuzhiyun	// t7 = (N) (P4 + P5) << 24
123*4882a593Smuzhiyun	// t9 = (K) (P6 + P7) << 32
124*4882a593Smuzhiyun	eor		t6.16b, t6.16b, t7.16b
125*4882a593Smuzhiyun	and		t7.16b, t7.16b, k00_16.16b
126*4882a593Smuzhiyun
127*4882a593Smuzhiyun	eor		t4.16b, t4.16b, t3.16b
128*4882a593Smuzhiyun	eor		t6.16b, t6.16b, t7.16b
129*4882a593Smuzhiyun
130*4882a593Smuzhiyun	zip2		t5.2d, t4.2d, t3.2d
131*4882a593Smuzhiyun	zip1		t3.2d, t4.2d, t3.2d
132*4882a593Smuzhiyun	zip2		t9.2d, t6.2d, t7.2d
133*4882a593Smuzhiyun	zip1		t7.2d, t6.2d, t7.2d
134*4882a593Smuzhiyun
135*4882a593Smuzhiyun	ext		t3.16b, t3.16b, t3.16b, #15
136*4882a593Smuzhiyun	ext		t5.16b, t5.16b, t5.16b, #14
137*4882a593Smuzhiyun	ext		t7.16b, t7.16b, t7.16b, #13
138*4882a593Smuzhiyun	ext		t9.16b, t9.16b, t9.16b, #12
139*4882a593Smuzhiyun
140*4882a593Smuzhiyun	eor		t3.16b, t3.16b, t5.16b
141*4882a593Smuzhiyun	eor		t7.16b, t7.16b, t9.16b
142*4882a593Smuzhiyun	eor		\rq\().16b, \rq\().16b, t3.16b
143*4882a593Smuzhiyun	eor		\rq\().16b, \rq\().16b, t7.16b
144*4882a593Smuzhiyun	.endm
145*4882a593Smuzhiyun
146*4882a593Smuzhiyun	.macro		__pmull_pre_p64
147*4882a593Smuzhiyun	add		x8, x3, #16
148*4882a593Smuzhiyun	ld1		{HH.2d-HH4.2d}, [x8]
149*4882a593Smuzhiyun
150*4882a593Smuzhiyun	trn1		SHASH2.2d, SHASH.2d, HH.2d
151*4882a593Smuzhiyun	trn2		T1.2d, SHASH.2d, HH.2d
152*4882a593Smuzhiyun	eor		SHASH2.16b, SHASH2.16b, T1.16b
153*4882a593Smuzhiyun
154*4882a593Smuzhiyun	trn1		HH34.2d, HH3.2d, HH4.2d
155*4882a593Smuzhiyun	trn2		T1.2d, HH3.2d, HH4.2d
156*4882a593Smuzhiyun	eor		HH34.16b, HH34.16b, T1.16b
157*4882a593Smuzhiyun
158*4882a593Smuzhiyun	movi		MASK.16b, #0xe1
159*4882a593Smuzhiyun	shl		MASK.2d, MASK.2d, #57
160*4882a593Smuzhiyun	.endm
161*4882a593Smuzhiyun
162*4882a593Smuzhiyun	.macro		__pmull_pre_p8
163*4882a593Smuzhiyun	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
164*4882a593Smuzhiyun	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
165*4882a593Smuzhiyun
166*4882a593Smuzhiyun	// k00_16 := 0x0000000000000000_000000000000ffff
167*4882a593Smuzhiyun	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
168*4882a593Smuzhiyun	movi		k32_48.2d, #0xffffffff
169*4882a593Smuzhiyun	mov		k32_48.h[2], k32_48.h[0]
170*4882a593Smuzhiyun	ushr		k00_16.2d, k32_48.2d, #32
171*4882a593Smuzhiyun
172*4882a593Smuzhiyun	// prepare the permutation vectors
173*4882a593Smuzhiyun	mov_q		x5, 0x080f0e0d0c0b0a09
174*4882a593Smuzhiyun	movi		T1.8b, #8
175*4882a593Smuzhiyun	dup		perm1.2d, x5
176*4882a593Smuzhiyun	eor		perm1.16b, perm1.16b, T1.16b
177*4882a593Smuzhiyun	ushr		perm2.2d, perm1.2d, #8
178*4882a593Smuzhiyun	ushr		perm3.2d, perm1.2d, #16
179*4882a593Smuzhiyun	ushr		T1.2d, perm1.2d, #24
180*4882a593Smuzhiyun	sli		perm2.2d, perm1.2d, #56
181*4882a593Smuzhiyun	sli		perm3.2d, perm1.2d, #48
182*4882a593Smuzhiyun	sli		T1.2d, perm1.2d, #40
183*4882a593Smuzhiyun
184*4882a593Smuzhiyun	// precompute loop invariants
185*4882a593Smuzhiyun	tbl		sh1.16b, {SHASH.16b}, perm1.16b
186*4882a593Smuzhiyun	tbl		sh2.16b, {SHASH.16b}, perm2.16b
187*4882a593Smuzhiyun	tbl		sh3.16b, {SHASH.16b}, perm3.16b
188*4882a593Smuzhiyun	tbl		sh4.16b, {SHASH.16b}, T1.16b
189*4882a593Smuzhiyun	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
190*4882a593Smuzhiyun	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
191*4882a593Smuzhiyun	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
192*4882a593Smuzhiyun	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
193*4882a593Smuzhiyun	.endm
194*4882a593Smuzhiyun
195*4882a593Smuzhiyun	//
196*4882a593Smuzhiyun	// PMULL (64x64->128) based reduction for CPUs that can do
197*4882a593Smuzhiyun	// it in a single instruction.
198*4882a593Smuzhiyun	//
199*4882a593Smuzhiyun	.macro		__pmull_reduce_p64
200*4882a593Smuzhiyun	pmull		T2.1q, XL.1d, MASK.1d
201*4882a593Smuzhiyun	eor		XM.16b, XM.16b, T1.16b
202*4882a593Smuzhiyun
203*4882a593Smuzhiyun	mov		XH.d[0], XM.d[1]
204*4882a593Smuzhiyun	mov		XM.d[1], XL.d[0]
205*4882a593Smuzhiyun
206*4882a593Smuzhiyun	eor		XL.16b, XM.16b, T2.16b
207*4882a593Smuzhiyun	ext		T2.16b, XL.16b, XL.16b, #8
208*4882a593Smuzhiyun	pmull		XL.1q, XL.1d, MASK.1d
209*4882a593Smuzhiyun	.endm
210*4882a593Smuzhiyun
211*4882a593Smuzhiyun	//
212*4882a593Smuzhiyun	// Alternative reduction for CPUs that lack support for the
213*4882a593Smuzhiyun	// 64x64->128 PMULL instruction
214*4882a593Smuzhiyun	//
215*4882a593Smuzhiyun	.macro		__pmull_reduce_p8
216*4882a593Smuzhiyun	eor		XM.16b, XM.16b, T1.16b
217*4882a593Smuzhiyun
218*4882a593Smuzhiyun	mov		XL.d[1], XM.d[0]
219*4882a593Smuzhiyun	mov		XH.d[0], XM.d[1]
220*4882a593Smuzhiyun
221*4882a593Smuzhiyun	shl		T1.2d, XL.2d, #57
222*4882a593Smuzhiyun	shl		T2.2d, XL.2d, #62
223*4882a593Smuzhiyun	eor		T2.16b, T2.16b, T1.16b
224*4882a593Smuzhiyun	shl		T1.2d, XL.2d, #63
225*4882a593Smuzhiyun	eor		T2.16b, T2.16b, T1.16b
226*4882a593Smuzhiyun	ext		T1.16b, XL.16b, XH.16b, #8
227*4882a593Smuzhiyun	eor		T2.16b, T2.16b, T1.16b
228*4882a593Smuzhiyun
229*4882a593Smuzhiyun	mov		XL.d[1], T2.d[0]
230*4882a593Smuzhiyun	mov		XH.d[0], T2.d[1]
231*4882a593Smuzhiyun
232*4882a593Smuzhiyun	ushr		T2.2d, XL.2d, #1
233*4882a593Smuzhiyun	eor		XH.16b, XH.16b, XL.16b
234*4882a593Smuzhiyun	eor		XL.16b, XL.16b, T2.16b
235*4882a593Smuzhiyun	ushr		T2.2d, T2.2d, #6
236*4882a593Smuzhiyun	ushr		XL.2d, XL.2d, #1
237*4882a593Smuzhiyun	.endm
238*4882a593Smuzhiyun
239*4882a593Smuzhiyun	.macro		__pmull_ghash, pn
240*4882a593Smuzhiyun	ld1		{SHASH.2d}, [x3]
241*4882a593Smuzhiyun	ld1		{XL.2d}, [x1]
242*4882a593Smuzhiyun
243*4882a593Smuzhiyun	__pmull_pre_\pn
244*4882a593Smuzhiyun
245*4882a593Smuzhiyun	/* do the head block first, if supplied */
246*4882a593Smuzhiyun	cbz		x4, 0f
247*4882a593Smuzhiyun	ld1		{T1.2d}, [x4]
248*4882a593Smuzhiyun	mov		x4, xzr
249*4882a593Smuzhiyun	b		3f
250*4882a593Smuzhiyun
251*4882a593Smuzhiyun0:	.ifc		\pn, p64
252*4882a593Smuzhiyun	tbnz		w0, #0, 2f		// skip until #blocks is a
253*4882a593Smuzhiyun	tbnz		w0, #1, 2f		// round multiple of 4
254*4882a593Smuzhiyun
255*4882a593Smuzhiyun1:	ld1		{XM3.16b-TT4.16b}, [x2], #64
256*4882a593Smuzhiyun
257*4882a593Smuzhiyun	sub		w0, w0, #4
258*4882a593Smuzhiyun
259*4882a593Smuzhiyun	rev64		T1.16b, XM3.16b
260*4882a593Smuzhiyun	rev64		T2.16b, XH3.16b
261*4882a593Smuzhiyun	rev64		TT4.16b, TT4.16b
262*4882a593Smuzhiyun	rev64		TT3.16b, TT3.16b
263*4882a593Smuzhiyun
264*4882a593Smuzhiyun	ext		IN1.16b, TT4.16b, TT4.16b, #8
265*4882a593Smuzhiyun	ext		XL3.16b, TT3.16b, TT3.16b, #8
266*4882a593Smuzhiyun
267*4882a593Smuzhiyun	eor		TT4.16b, TT4.16b, IN1.16b
268*4882a593Smuzhiyun	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
269*4882a593Smuzhiyun	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
270*4882a593Smuzhiyun	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
271*4882a593Smuzhiyun
272*4882a593Smuzhiyun	eor		TT3.16b, TT3.16b, XL3.16b
273*4882a593Smuzhiyun	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
274*4882a593Smuzhiyun	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
275*4882a593Smuzhiyun	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
276*4882a593Smuzhiyun
277*4882a593Smuzhiyun	ext		IN1.16b, T2.16b, T2.16b, #8
278*4882a593Smuzhiyun	eor		XL2.16b, XL2.16b, XL3.16b
279*4882a593Smuzhiyun	eor		XH2.16b, XH2.16b, XH3.16b
280*4882a593Smuzhiyun	eor		XM2.16b, XM2.16b, XM3.16b
281*4882a593Smuzhiyun
282*4882a593Smuzhiyun	eor		T2.16b, T2.16b, IN1.16b
283*4882a593Smuzhiyun	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
284*4882a593Smuzhiyun	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
285*4882a593Smuzhiyun	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
286*4882a593Smuzhiyun
287*4882a593Smuzhiyun	eor		XL2.16b, XL2.16b, XL3.16b
288*4882a593Smuzhiyun	eor		XH2.16b, XH2.16b, XH3.16b
289*4882a593Smuzhiyun	eor		XM2.16b, XM2.16b, XM3.16b
290*4882a593Smuzhiyun
291*4882a593Smuzhiyun	ext		IN1.16b, T1.16b, T1.16b, #8
292*4882a593Smuzhiyun	ext		TT3.16b, XL.16b, XL.16b, #8
293*4882a593Smuzhiyun	eor		XL.16b, XL.16b, IN1.16b
294*4882a593Smuzhiyun	eor		T1.16b, T1.16b, TT3.16b
295*4882a593Smuzhiyun
296*4882a593Smuzhiyun	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
297*4882a593Smuzhiyun	eor		T1.16b, T1.16b, XL.16b
298*4882a593Smuzhiyun	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
299*4882a593Smuzhiyun	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
300*4882a593Smuzhiyun
301*4882a593Smuzhiyun	eor		XL.16b, XL.16b, XL2.16b
302*4882a593Smuzhiyun	eor		XH.16b, XH.16b, XH2.16b
303*4882a593Smuzhiyun	eor		XM.16b, XM.16b, XM2.16b
304*4882a593Smuzhiyun
305*4882a593Smuzhiyun	eor		T2.16b, XL.16b, XH.16b
306*4882a593Smuzhiyun	ext		T1.16b, XL.16b, XH.16b, #8
307*4882a593Smuzhiyun	eor		XM.16b, XM.16b, T2.16b
308*4882a593Smuzhiyun
309*4882a593Smuzhiyun	__pmull_reduce_p64
310*4882a593Smuzhiyun
311*4882a593Smuzhiyun	eor		T2.16b, T2.16b, XH.16b
312*4882a593Smuzhiyun	eor		XL.16b, XL.16b, T2.16b
313*4882a593Smuzhiyun
314*4882a593Smuzhiyun	cbz		w0, 5f
315*4882a593Smuzhiyun	b		1b
316*4882a593Smuzhiyun	.endif
317*4882a593Smuzhiyun
318*4882a593Smuzhiyun2:	ld1		{T1.2d}, [x2], #16
319*4882a593Smuzhiyun	sub		w0, w0, #1
320*4882a593Smuzhiyun
321*4882a593Smuzhiyun3:	/* multiply XL by SHASH in GF(2^128) */
322*4882a593SmuzhiyunCPU_LE(	rev64		T1.16b, T1.16b	)
323*4882a593Smuzhiyun
324*4882a593Smuzhiyun	ext		T2.16b, XL.16b, XL.16b, #8
325*4882a593Smuzhiyun	ext		IN1.16b, T1.16b, T1.16b, #8
326*4882a593Smuzhiyun	eor		T1.16b, T1.16b, T2.16b
327*4882a593Smuzhiyun	eor		XL.16b, XL.16b, IN1.16b
328*4882a593Smuzhiyun
329*4882a593Smuzhiyun	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
330*4882a593Smuzhiyun	eor		T1.16b, T1.16b, XL.16b
331*4882a593Smuzhiyun	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
332*4882a593Smuzhiyun	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
333*4882a593Smuzhiyun
334*4882a593Smuzhiyun4:	eor		T2.16b, XL.16b, XH.16b
335*4882a593Smuzhiyun	ext		T1.16b, XL.16b, XH.16b, #8
336*4882a593Smuzhiyun	eor		XM.16b, XM.16b, T2.16b
337*4882a593Smuzhiyun
338*4882a593Smuzhiyun	__pmull_reduce_\pn
339*4882a593Smuzhiyun
340*4882a593Smuzhiyun	eor		T2.16b, T2.16b, XH.16b
341*4882a593Smuzhiyun	eor		XL.16b, XL.16b, T2.16b
342*4882a593Smuzhiyun
343*4882a593Smuzhiyun	cbnz		w0, 0b
344*4882a593Smuzhiyun
345*4882a593Smuzhiyun5:	st1		{XL.2d}, [x1]
346*4882a593Smuzhiyun	ret
347*4882a593Smuzhiyun	.endm
348*4882a593Smuzhiyun
349*4882a593Smuzhiyun	/*
350*4882a593Smuzhiyun	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
351*4882a593Smuzhiyun	 *			   struct ghash_key const *k, const char *head)
352*4882a593Smuzhiyun	 */
353*4882a593SmuzhiyunSYM_FUNC_START(pmull_ghash_update_p64)
354*4882a593Smuzhiyun	__pmull_ghash	p64
355*4882a593SmuzhiyunSYM_FUNC_END(pmull_ghash_update_p64)
356*4882a593Smuzhiyun
357*4882a593SmuzhiyunSYM_FUNC_START(pmull_ghash_update_p8)
358*4882a593Smuzhiyun	__pmull_ghash	p8
359*4882a593SmuzhiyunSYM_FUNC_END(pmull_ghash_update_p8)
360*4882a593Smuzhiyun
361*4882a593Smuzhiyun	KS0		.req	v8
362*4882a593Smuzhiyun	KS1		.req	v9
363*4882a593Smuzhiyun	KS2		.req	v10
364*4882a593Smuzhiyun	KS3		.req	v11
365*4882a593Smuzhiyun
366*4882a593Smuzhiyun	INP0		.req	v21
367*4882a593Smuzhiyun	INP1		.req	v22
368*4882a593Smuzhiyun	INP2		.req	v23
369*4882a593Smuzhiyun	INP3		.req	v24
370*4882a593Smuzhiyun
371*4882a593Smuzhiyun	K0		.req	v25
372*4882a593Smuzhiyun	K1		.req	v26
373*4882a593Smuzhiyun	K2		.req	v27
374*4882a593Smuzhiyun	K3		.req	v28
375*4882a593Smuzhiyun	K4		.req	v12
376*4882a593Smuzhiyun	K5		.req	v13
377*4882a593Smuzhiyun	K6		.req	v4
378*4882a593Smuzhiyun	K7		.req	v5
379*4882a593Smuzhiyun	K8		.req	v14
380*4882a593Smuzhiyun	K9		.req	v15
381*4882a593Smuzhiyun	KK		.req	v29
382*4882a593Smuzhiyun	KL		.req	v30
383*4882a593Smuzhiyun	KM		.req	v31
384*4882a593Smuzhiyun
385*4882a593Smuzhiyun	.macro		load_round_keys, rounds, rk, tmp
386*4882a593Smuzhiyun	add		\tmp, \rk, #64
387*4882a593Smuzhiyun	ld1		{K0.4s-K3.4s}, [\rk]
388*4882a593Smuzhiyun	ld1		{K4.4s-K5.4s}, [\tmp]
389*4882a593Smuzhiyun	add		\tmp, \rk, \rounds, lsl #4
390*4882a593Smuzhiyun	sub		\tmp, \tmp, #32
391*4882a593Smuzhiyun	ld1		{KK.4s-KM.4s}, [\tmp]
392*4882a593Smuzhiyun	.endm
393*4882a593Smuzhiyun
394*4882a593Smuzhiyun	.macro		enc_round, state, key
395*4882a593Smuzhiyun	aese		\state\().16b, \key\().16b
396*4882a593Smuzhiyun	aesmc		\state\().16b, \state\().16b
397*4882a593Smuzhiyun	.endm
398*4882a593Smuzhiyun
399*4882a593Smuzhiyun	.macro		enc_qround, s0, s1, s2, s3, key
400*4882a593Smuzhiyun	enc_round	\s0, \key
401*4882a593Smuzhiyun	enc_round	\s1, \key
402*4882a593Smuzhiyun	enc_round	\s2, \key
403*4882a593Smuzhiyun	enc_round	\s3, \key
404*4882a593Smuzhiyun	.endm
405*4882a593Smuzhiyun
406*4882a593Smuzhiyun	.macro		enc_block, state, rounds, rk, tmp
407*4882a593Smuzhiyun	add		\tmp, \rk, #96
408*4882a593Smuzhiyun	ld1		{K6.4s-K7.4s}, [\tmp], #32
409*4882a593Smuzhiyun	.irp		key, K0, K1, K2, K3, K4 K5
410*4882a593Smuzhiyun	enc_round	\state, \key
411*4882a593Smuzhiyun	.endr
412*4882a593Smuzhiyun
413*4882a593Smuzhiyun	tbnz		\rounds, #2, .Lnot128_\@
414*4882a593Smuzhiyun.Lout256_\@:
415*4882a593Smuzhiyun	enc_round	\state, K6
416*4882a593Smuzhiyun	enc_round	\state, K7
417*4882a593Smuzhiyun
418*4882a593Smuzhiyun.Lout192_\@:
419*4882a593Smuzhiyun	enc_round	\state, KK
420*4882a593Smuzhiyun	aese		\state\().16b, KL.16b
421*4882a593Smuzhiyun	eor		\state\().16b, \state\().16b, KM.16b
422*4882a593Smuzhiyun
423*4882a593Smuzhiyun	.subsection	1
424*4882a593Smuzhiyun.Lnot128_\@:
425*4882a593Smuzhiyun	ld1		{K8.4s-K9.4s}, [\tmp], #32
426*4882a593Smuzhiyun	enc_round	\state, K6
427*4882a593Smuzhiyun	enc_round	\state, K7
428*4882a593Smuzhiyun	ld1		{K6.4s-K7.4s}, [\tmp]
429*4882a593Smuzhiyun	enc_round	\state, K8
430*4882a593Smuzhiyun	enc_round	\state, K9
431*4882a593Smuzhiyun	tbz		\rounds, #1, .Lout192_\@
432*4882a593Smuzhiyun	b		.Lout256_\@
433*4882a593Smuzhiyun	.previous
434*4882a593Smuzhiyun	.endm
435*4882a593Smuzhiyun
436*4882a593Smuzhiyun	.align		6
437*4882a593Smuzhiyun	.macro		pmull_gcm_do_crypt, enc
438*4882a593Smuzhiyun	stp		x29, x30, [sp, #-32]!
439*4882a593Smuzhiyun	mov		x29, sp
440*4882a593Smuzhiyun	str		x19, [sp, #24]
441*4882a593Smuzhiyun
442*4882a593Smuzhiyun	load_round_keys	x7, x6, x8
443*4882a593Smuzhiyun
444*4882a593Smuzhiyun	ld1		{SHASH.2d}, [x3], #16
445*4882a593Smuzhiyun	ld1		{HH.2d-HH4.2d}, [x3]
446*4882a593Smuzhiyun
447*4882a593Smuzhiyun	trn1		SHASH2.2d, SHASH.2d, HH.2d
448*4882a593Smuzhiyun	trn2		T1.2d, SHASH.2d, HH.2d
449*4882a593Smuzhiyun	eor		SHASH2.16b, SHASH2.16b, T1.16b
450*4882a593Smuzhiyun
451*4882a593Smuzhiyun	trn1		HH34.2d, HH3.2d, HH4.2d
452*4882a593Smuzhiyun	trn2		T1.2d, HH3.2d, HH4.2d
453*4882a593Smuzhiyun	eor		HH34.16b, HH34.16b, T1.16b
454*4882a593Smuzhiyun
455*4882a593Smuzhiyun	ld1		{XL.2d}, [x4]
456*4882a593Smuzhiyun
457*4882a593Smuzhiyun	cbz		x0, 3f				// tag only?
458*4882a593Smuzhiyun
459*4882a593Smuzhiyun	ldr		w8, [x5, #12]			// load lower counter
460*4882a593SmuzhiyunCPU_LE(	rev		w8, w8		)
461*4882a593Smuzhiyun
462*4882a593Smuzhiyun0:	mov		w9, #4				// max blocks per round
463*4882a593Smuzhiyun	add		x10, x0, #0xf
464*4882a593Smuzhiyun	lsr		x10, x10, #4			// remaining blocks
465*4882a593Smuzhiyun
466*4882a593Smuzhiyun	subs		x0, x0, #64
467*4882a593Smuzhiyun	csel		w9, w10, w9, mi
468*4882a593Smuzhiyun	add		w8, w8, w9
469*4882a593Smuzhiyun
470*4882a593Smuzhiyun	bmi		1f
471*4882a593Smuzhiyun	ld1		{INP0.16b-INP3.16b}, [x2], #64
472*4882a593Smuzhiyun	.subsection	1
473*4882a593Smuzhiyun	/*
474*4882a593Smuzhiyun	 * Populate the four input registers right to left with up to 63 bytes
475*4882a593Smuzhiyun	 * of data, using overlapping loads to avoid branches.
476*4882a593Smuzhiyun	 *
477*4882a593Smuzhiyun	 *                INP0     INP1     INP2     INP3
478*4882a593Smuzhiyun	 *  1 byte     |        |        |        |x       |
479*4882a593Smuzhiyun	 * 16 bytes    |        |        |        |xxxxxxxx|
480*4882a593Smuzhiyun	 * 17 bytes    |        |        |xxxxxxxx|x       |
481*4882a593Smuzhiyun	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
482*4882a593Smuzhiyun	 * etc etc
483*4882a593Smuzhiyun	 *
484*4882a593Smuzhiyun	 * Note that this code may read up to 15 bytes before the start of
485*4882a593Smuzhiyun	 * the input. It is up to the calling code to ensure this is safe if
486*4882a593Smuzhiyun	 * this happens in the first iteration of the loop (i.e., when the
487*4882a593Smuzhiyun	 * input size is < 16 bytes)
488*4882a593Smuzhiyun	 */
489*4882a593Smuzhiyun1:	mov		x15, #16
490*4882a593Smuzhiyun	ands		x19, x0, #0xf
491*4882a593Smuzhiyun	csel		x19, x19, x15, ne
492*4882a593Smuzhiyun	adr_l		x17, .Lpermute_table + 16
493*4882a593Smuzhiyun
494*4882a593Smuzhiyun	sub		x11, x15, x19
495*4882a593Smuzhiyun	add		x12, x17, x11
496*4882a593Smuzhiyun	sub		x17, x17, x11
497*4882a593Smuzhiyun	ld1		{T1.16b}, [x12]
498*4882a593Smuzhiyun	sub		x10, x1, x11
499*4882a593Smuzhiyun	sub		x11, x2, x11
500*4882a593Smuzhiyun
501*4882a593Smuzhiyun	cmp		x0, #-16
502*4882a593Smuzhiyun	csel		x14, x15, xzr, gt
503*4882a593Smuzhiyun	cmp		x0, #-32
504*4882a593Smuzhiyun	csel		x15, x15, xzr, gt
505*4882a593Smuzhiyun	cmp		x0, #-48
506*4882a593Smuzhiyun	csel		x16, x19, xzr, gt
507*4882a593Smuzhiyun	csel		x1, x1, x10, gt
508*4882a593Smuzhiyun	csel		x2, x2, x11, gt
509*4882a593Smuzhiyun
510*4882a593Smuzhiyun	ld1		{INP0.16b}, [x2], x14
511*4882a593Smuzhiyun	ld1		{INP1.16b}, [x2], x15
512*4882a593Smuzhiyun	ld1		{INP2.16b}, [x2], x16
513*4882a593Smuzhiyun	ld1		{INP3.16b}, [x2]
514*4882a593Smuzhiyun	tbl		INP3.16b, {INP3.16b}, T1.16b
515*4882a593Smuzhiyun	b		2f
516*4882a593Smuzhiyun	.previous
517*4882a593Smuzhiyun
518*4882a593Smuzhiyun2:	.if		\enc == 0
519*4882a593Smuzhiyun	bl		pmull_gcm_ghash_4x
520*4882a593Smuzhiyun	.endif
521*4882a593Smuzhiyun
522*4882a593Smuzhiyun	bl		pmull_gcm_enc_4x
523*4882a593Smuzhiyun
524*4882a593Smuzhiyun	tbnz		x0, #63, 6f
525*4882a593Smuzhiyun	st1		{INP0.16b-INP3.16b}, [x1], #64
526*4882a593Smuzhiyun	.if		\enc == 1
527*4882a593Smuzhiyun	bl		pmull_gcm_ghash_4x
528*4882a593Smuzhiyun	.endif
529*4882a593Smuzhiyun	bne		0b
530*4882a593Smuzhiyun
531*4882a593Smuzhiyun3:	ldp		x19, x10, [sp, #24]
532*4882a593Smuzhiyun	cbz		x10, 5f				// output tag?
533*4882a593Smuzhiyun
534*4882a593Smuzhiyun	ld1		{INP3.16b}, [x10]		// load lengths[]
535*4882a593Smuzhiyun	mov		w9, #1
536*4882a593Smuzhiyun	bl		pmull_gcm_ghash_4x
537*4882a593Smuzhiyun
538*4882a593Smuzhiyun	mov		w11, #(0x1 << 24)		// BE '1U'
539*4882a593Smuzhiyun	ld1		{KS0.16b}, [x5]
540*4882a593Smuzhiyun	mov		KS0.s[3], w11
541*4882a593Smuzhiyun
542*4882a593Smuzhiyun	enc_block	KS0, x7, x6, x12
543*4882a593Smuzhiyun
544*4882a593Smuzhiyun	ext		XL.16b, XL.16b, XL.16b, #8
545*4882a593Smuzhiyun	rev64		XL.16b, XL.16b
546*4882a593Smuzhiyun	eor		XL.16b, XL.16b, KS0.16b
547*4882a593Smuzhiyun
548*4882a593Smuzhiyun	.if		\enc == 1
549*4882a593Smuzhiyun	st1		{XL.16b}, [x10]			// store tag
550*4882a593Smuzhiyun	.else
551*4882a593Smuzhiyun	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
552*4882a593Smuzhiyun	adr_l		x17, .Lpermute_table
553*4882a593Smuzhiyun	ld1		{KS0.16b}, [x11]		// load supplied tag
554*4882a593Smuzhiyun	add		x17, x17, x12
555*4882a593Smuzhiyun	ld1		{KS1.16b}, [x17]		// load permute vector
556*4882a593Smuzhiyun
557*4882a593Smuzhiyun	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
558*4882a593Smuzhiyun	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
559*4882a593Smuzhiyun	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
560*4882a593Smuzhiyun	sminv		b0, XL.16b			// signed minimum across XL
561*4882a593Smuzhiyun	smov		w0, v0.b[0]			// return b0
562*4882a593Smuzhiyun	.endif
563*4882a593Smuzhiyun
564*4882a593Smuzhiyun4:	ldp		x29, x30, [sp], #32
565*4882a593Smuzhiyun	ret
566*4882a593Smuzhiyun
567*4882a593Smuzhiyun5:
568*4882a593SmuzhiyunCPU_LE(	rev		w8, w8		)
569*4882a593Smuzhiyun	str		w8, [x5, #12]			// store lower counter
570*4882a593Smuzhiyun	st1		{XL.2d}, [x4]
571*4882a593Smuzhiyun	b		4b
572*4882a593Smuzhiyun
573*4882a593Smuzhiyun6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
574*4882a593Smuzhiyun	sub		x17, x17, x19, lsl #1
575*4882a593Smuzhiyun
576*4882a593Smuzhiyun	cmp		w9, #1
577*4882a593Smuzhiyun	beq		7f
578*4882a593Smuzhiyun	.subsection	1
579*4882a593Smuzhiyun7:	ld1		{INP2.16b}, [x1]
580*4882a593Smuzhiyun	tbx		INP2.16b, {INP3.16b}, T1.16b
581*4882a593Smuzhiyun	mov		INP3.16b, INP2.16b
582*4882a593Smuzhiyun	b		8f
583*4882a593Smuzhiyun	.previous
584*4882a593Smuzhiyun
585*4882a593Smuzhiyun	st1		{INP0.16b}, [x1], x14
586*4882a593Smuzhiyun	st1		{INP1.16b}, [x1], x15
587*4882a593Smuzhiyun	st1		{INP2.16b}, [x1], x16
588*4882a593Smuzhiyun	tbl		INP3.16b, {INP3.16b}, T1.16b
589*4882a593Smuzhiyun	tbx		INP3.16b, {INP2.16b}, T2.16b
590*4882a593Smuzhiyun8:	st1		{INP3.16b}, [x1]
591*4882a593Smuzhiyun
592*4882a593Smuzhiyun	.if		\enc == 1
593*4882a593Smuzhiyun	ld1		{T1.16b}, [x17]
594*4882a593Smuzhiyun	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
595*4882a593Smuzhiyun	bl		pmull_gcm_ghash_4x
596*4882a593Smuzhiyun	.endif
597*4882a593Smuzhiyun	b		3b
598*4882a593Smuzhiyun	.endm
599*4882a593Smuzhiyun
600*4882a593Smuzhiyun	/*
601*4882a593Smuzhiyun	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
602*4882a593Smuzhiyun	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
603*4882a593Smuzhiyun	 *			  int rounds, u8 tag)
604*4882a593Smuzhiyun	 */
605*4882a593SmuzhiyunSYM_FUNC_START(pmull_gcm_encrypt)
606*4882a593Smuzhiyun	pmull_gcm_do_crypt	1
607*4882a593SmuzhiyunSYM_FUNC_END(pmull_gcm_encrypt)
608*4882a593Smuzhiyun
609*4882a593Smuzhiyun	/*
610*4882a593Smuzhiyun	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
611*4882a593Smuzhiyun	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
612*4882a593Smuzhiyun	 *			  int rounds, u8 tag)
613*4882a593Smuzhiyun	 */
614*4882a593SmuzhiyunSYM_FUNC_START(pmull_gcm_decrypt)
615*4882a593Smuzhiyun	pmull_gcm_do_crypt	0
616*4882a593SmuzhiyunSYM_FUNC_END(pmull_gcm_decrypt)
617*4882a593Smuzhiyun
618*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
619*4882a593Smuzhiyun	movi		MASK.16b, #0xe1
620*4882a593Smuzhiyun	shl		MASK.2d, MASK.2d, #57
621*4882a593Smuzhiyun
622*4882a593Smuzhiyun	rev64		T1.16b, INP0.16b
623*4882a593Smuzhiyun	rev64		T2.16b, INP1.16b
624*4882a593Smuzhiyun	rev64		TT3.16b, INP2.16b
625*4882a593Smuzhiyun	rev64		TT4.16b, INP3.16b
626*4882a593Smuzhiyun
627*4882a593Smuzhiyun	ext		XL.16b, XL.16b, XL.16b, #8
628*4882a593Smuzhiyun
629*4882a593Smuzhiyun	tbz		w9, #2, 0f			// <4 blocks?
630*4882a593Smuzhiyun	.subsection	1
631*4882a593Smuzhiyun0:	movi		XH2.16b, #0
632*4882a593Smuzhiyun	movi		XM2.16b, #0
633*4882a593Smuzhiyun	movi		XL2.16b, #0
634*4882a593Smuzhiyun
635*4882a593Smuzhiyun	tbz		w9, #0, 1f			// 2 blocks?
636*4882a593Smuzhiyun	tbz		w9, #1, 2f			// 1 block?
637*4882a593Smuzhiyun
638*4882a593Smuzhiyun	eor		T2.16b, T2.16b, XL.16b
639*4882a593Smuzhiyun	ext		T1.16b, T2.16b, T2.16b, #8
640*4882a593Smuzhiyun	b		.Lgh3
641*4882a593Smuzhiyun
642*4882a593Smuzhiyun1:	eor		TT3.16b, TT3.16b, XL.16b
643*4882a593Smuzhiyun	ext		T2.16b, TT3.16b, TT3.16b, #8
644*4882a593Smuzhiyun	b		.Lgh2
645*4882a593Smuzhiyun
646*4882a593Smuzhiyun2:	eor		TT4.16b, TT4.16b, XL.16b
647*4882a593Smuzhiyun	ext		IN1.16b, TT4.16b, TT4.16b, #8
648*4882a593Smuzhiyun	b		.Lgh1
649*4882a593Smuzhiyun	.previous
650*4882a593Smuzhiyun
651*4882a593Smuzhiyun	eor		T1.16b, T1.16b, XL.16b
652*4882a593Smuzhiyun	ext		IN1.16b, T1.16b, T1.16b, #8
653*4882a593Smuzhiyun
654*4882a593Smuzhiyun	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
655*4882a593Smuzhiyun	eor		T1.16b, T1.16b, IN1.16b
656*4882a593Smuzhiyun	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
657*4882a593Smuzhiyun	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
658*4882a593Smuzhiyun
659*4882a593Smuzhiyun	ext		T1.16b, T2.16b, T2.16b, #8
660*4882a593Smuzhiyun.Lgh3:	eor		T2.16b, T2.16b, T1.16b
661*4882a593Smuzhiyun	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
662*4882a593Smuzhiyun	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
663*4882a593Smuzhiyun	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
664*4882a593Smuzhiyun
665*4882a593Smuzhiyun	eor		XH2.16b, XH2.16b, XH.16b
666*4882a593Smuzhiyun	eor		XL2.16b, XL2.16b, XL.16b
667*4882a593Smuzhiyun	eor		XM2.16b, XM2.16b, XM.16b
668*4882a593Smuzhiyun
669*4882a593Smuzhiyun	ext		T2.16b, TT3.16b, TT3.16b, #8
670*4882a593Smuzhiyun.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
671*4882a593Smuzhiyun	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
672*4882a593Smuzhiyun	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
673*4882a593Smuzhiyun	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
674*4882a593Smuzhiyun
675*4882a593Smuzhiyun	eor		XH2.16b, XH2.16b, XH.16b
676*4882a593Smuzhiyun	eor		XL2.16b, XL2.16b, XL.16b
677*4882a593Smuzhiyun	eor		XM2.16b, XM2.16b, XM.16b
678*4882a593Smuzhiyun
679*4882a593Smuzhiyun	ext		IN1.16b, TT4.16b, TT4.16b, #8
680*4882a593Smuzhiyun.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
681*4882a593Smuzhiyun	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
682*4882a593Smuzhiyun	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
683*4882a593Smuzhiyun	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
684*4882a593Smuzhiyun
685*4882a593Smuzhiyun	eor		XH.16b, XH.16b, XH2.16b
686*4882a593Smuzhiyun	eor		XL.16b, XL.16b, XL2.16b
687*4882a593Smuzhiyun	eor		XM.16b, XM.16b, XM2.16b
688*4882a593Smuzhiyun
689*4882a593Smuzhiyun	eor		T2.16b, XL.16b, XH.16b
690*4882a593Smuzhiyun	ext		T1.16b, XL.16b, XH.16b, #8
691*4882a593Smuzhiyun	eor		XM.16b, XM.16b, T2.16b
692*4882a593Smuzhiyun
693*4882a593Smuzhiyun	__pmull_reduce_p64
694*4882a593Smuzhiyun
695*4882a593Smuzhiyun	eor		T2.16b, T2.16b, XH.16b
696*4882a593Smuzhiyun	eor		XL.16b, XL.16b, T2.16b
697*4882a593Smuzhiyun
698*4882a593Smuzhiyun	ret
699*4882a593SmuzhiyunSYM_FUNC_END(pmull_gcm_ghash_4x)
700*4882a593Smuzhiyun
701*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
702*4882a593Smuzhiyun	ld1		{KS0.16b}, [x5]			// load upper counter
703*4882a593Smuzhiyun	sub		w10, w8, #4
704*4882a593Smuzhiyun	sub		w11, w8, #3
705*4882a593Smuzhiyun	sub		w12, w8, #2
706*4882a593Smuzhiyun	sub		w13, w8, #1
707*4882a593Smuzhiyun	rev		w10, w10
708*4882a593Smuzhiyun	rev		w11, w11
709*4882a593Smuzhiyun	rev		w12, w12
710*4882a593Smuzhiyun	rev		w13, w13
711*4882a593Smuzhiyun	mov		KS1.16b, KS0.16b
712*4882a593Smuzhiyun	mov		KS2.16b, KS0.16b
713*4882a593Smuzhiyun	mov		KS3.16b, KS0.16b
714*4882a593Smuzhiyun	ins		KS0.s[3], w10			// set lower counter
715*4882a593Smuzhiyun	ins		KS1.s[3], w11
716*4882a593Smuzhiyun	ins		KS2.s[3], w12
717*4882a593Smuzhiyun	ins		KS3.s[3], w13
718*4882a593Smuzhiyun
719*4882a593Smuzhiyun	add		x10, x6, #96			// round key pointer
720*4882a593Smuzhiyun	ld1		{K6.4s-K7.4s}, [x10], #32
721*4882a593Smuzhiyun	.irp		key, K0, K1, K2, K3, K4, K5
722*4882a593Smuzhiyun	enc_qround	KS0, KS1, KS2, KS3, \key
723*4882a593Smuzhiyun	.endr
724*4882a593Smuzhiyun
725*4882a593Smuzhiyun	tbnz		x7, #2, .Lnot128
726*4882a593Smuzhiyun	.subsection	1
727*4882a593Smuzhiyun.Lnot128:
728*4882a593Smuzhiyun	ld1		{K8.4s-K9.4s}, [x10], #32
729*4882a593Smuzhiyun	.irp		key, K6, K7
730*4882a593Smuzhiyun	enc_qround	KS0, KS1, KS2, KS3, \key
731*4882a593Smuzhiyun	.endr
732*4882a593Smuzhiyun	ld1		{K6.4s-K7.4s}, [x10]
733*4882a593Smuzhiyun	.irp		key, K8, K9
734*4882a593Smuzhiyun	enc_qround	KS0, KS1, KS2, KS3, \key
735*4882a593Smuzhiyun	.endr
736*4882a593Smuzhiyun	tbz		x7, #1, .Lout192
737*4882a593Smuzhiyun	b		.Lout256
738*4882a593Smuzhiyun	.previous
739*4882a593Smuzhiyun
740*4882a593Smuzhiyun.Lout256:
741*4882a593Smuzhiyun	.irp		key, K6, K7
742*4882a593Smuzhiyun	enc_qround	KS0, KS1, KS2, KS3, \key
743*4882a593Smuzhiyun	.endr
744*4882a593Smuzhiyun
745*4882a593Smuzhiyun.Lout192:
746*4882a593Smuzhiyun	enc_qround	KS0, KS1, KS2, KS3, KK
747*4882a593Smuzhiyun
748*4882a593Smuzhiyun	aese		KS0.16b, KL.16b
749*4882a593Smuzhiyun	aese		KS1.16b, KL.16b
750*4882a593Smuzhiyun	aese		KS2.16b, KL.16b
751*4882a593Smuzhiyun	aese		KS3.16b, KL.16b
752*4882a593Smuzhiyun
753*4882a593Smuzhiyun	eor		KS0.16b, KS0.16b, KM.16b
754*4882a593Smuzhiyun	eor		KS1.16b, KS1.16b, KM.16b
755*4882a593Smuzhiyun	eor		KS2.16b, KS2.16b, KM.16b
756*4882a593Smuzhiyun	eor		KS3.16b, KS3.16b, KM.16b
757*4882a593Smuzhiyun
758*4882a593Smuzhiyun	eor		INP0.16b, INP0.16b, KS0.16b
759*4882a593Smuzhiyun	eor		INP1.16b, INP1.16b, KS1.16b
760*4882a593Smuzhiyun	eor		INP2.16b, INP2.16b, KS2.16b
761*4882a593Smuzhiyun	eor		INP3.16b, INP3.16b, KS3.16b
762*4882a593Smuzhiyun
763*4882a593Smuzhiyun	ret
764*4882a593SmuzhiyunSYM_FUNC_END(pmull_gcm_enc_4x)
765*4882a593Smuzhiyun
766*4882a593Smuzhiyun	.section	".rodata", "a"
767*4882a593Smuzhiyun	.align		6
768*4882a593Smuzhiyun.Lpermute_table:
769*4882a593Smuzhiyun	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
770*4882a593Smuzhiyun	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
771*4882a593Smuzhiyun	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
772*4882a593Smuzhiyun	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
773*4882a593Smuzhiyun	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
774*4882a593Smuzhiyun	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
775*4882a593Smuzhiyun	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
776*4882a593Smuzhiyun	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
777*4882a593Smuzhiyun	.previous
778