//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
//      /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
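//
// Overview: the CPU_LE byte reversals below put the message into polynomial
// coefficient order (CRC-T10DIF is a non-reflected CRC).  For buffers of at
// least 256 bytes, a 128-byte accumulator held in v0-v7 is folded across the
// input using carryless (PMULL) multiplication, then folded down across 64,
// 32 and 16 bytes, and the remaining 128-bit value is reduced to the final
// 16-bit CRC with a Barrett reduction.  The "p64" entry point uses the
// Crypto Extensions 64x64-bit PMULL; the "p8" entry point runs the same
// algorithm on top of the baseline 8x8-bit PMULL.
//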

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	fold_consts_ptr	.req	x3

	fold_consts	.req	v10

	ad		.req	v14

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm

	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

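// Helper for the "p8" fallback: together with the __pmull_p8 macro below,
// this emulates a 64x64 -> 128 bit carryless multiply using only the 8-bit
// PMULL instruction.  __pmull_pre_p8 above prepares bd1..bd4 as copies of
// the fold constants rotated by 1..4 bytes within each 64-bit half; this
// routine multiplies the data against those rotated operands (and rotated
// copies of the data against the original constants), masks and byte-shifts
// the partial products back into place, and returns the combined cross
// terms in t4 and t6, which the caller XORs into the aligned product D.
// The two entry points operate on the low (pmull) and high (pmull2) 64-bit
// halves of the inputs respectively.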
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
SYM_FUNC_END(__pmull_p8_core)

	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, fold_consts
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	__pmull_\p	v8, \reg1, fold_consts, 2
	__pmull_\p	\reg1, \reg1, fold_consts

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, fold_consts, 2
	__pmull_\p	\reg2, \reg2, fold_consts

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm
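
	// A note on the math: folding one 16-byte accumulator register R
	// forward into the 16 bytes D that lie N bytes further along in the
	// message computes
	//	R' = R_hi * (x^(8*N+64) mod G) + R_lo * (x^(8*N) mod G) + D
	// (carryless multiplies and XORs), which is congruent to
	// R * x^(8*N) + D modulo G, so the running remainder is preserved.
	// fold_32_bytes above performs this for two registers with N = 128
	// (the main loop); fold_16_bytes below is the same operation with
	// N = 64, 32 or 16, depending on which constants are currently loaded
	// in fold_consts.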

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	__pmull_\p	v8, \src_reg, fold_consts
	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm

	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

	.macro		crc_t10dif_pmull, p
	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b

	// Load the constants for folding across 128 bytes.
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	__pmull_\p	v8, v7, fold_consts
	__pmull_\p	v7, v7, fold_consts, 2
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.
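	//
	// For example, with len = 3 the last 19 message bytes are re-divided
	// into a first chunk holding the 3 highest-order bytes of v7 and a
	// second 16-byte chunk made of the remaining 13 bytes of v7 followed
	// by the final 3 data bytes; the first chunk is then folded forward 16
	// bytes into the second using the fold-across-16-bytes constants that
	// are already loaded in fold_consts.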

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
	// then '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	__pmull_\p	v0, v3, fold_consts
	__pmull_\p	v7, v3, fold_consts, 2
	eor		v7.16b, v7.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]	// zero high 32 bits
	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b	// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Use Barrett reduction to compute the final CRC value.
	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32	// /= x^32
	__pmull_\p	v1, v1, fold_consts	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	.ifc		\p, p8
	ldp		x29, x30, [sp], #16
	.endif
	ret

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
	.endm

//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp
	crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

	.align		5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
SYM_FUNC_END(crc_t10dif_pmull_p64)
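
//
// For reference, a C caller (e.g. the CRC-T10DIF glue code) would use these
// routines along the lines of the sketch below.  This is illustrative only,
// not the actual glue code: have_pmull64 and crc_t10dif_generic() stand in
// for whatever CPU-feature check and generic fallback the caller uses, and
// kernel-mode NEON must be enabled around the call since these routines
// clobber a wide range of NEON registers.
//
//	u16 crc_t10dif_update(u16 crc, const u8 *data, size_t len)
//	{
//		if (len < 16)			/* asm assumes len >= 16 */
//			return crc_t10dif_generic(crc, data, len);
//
//		kernel_neon_begin();
//		crc = have_pmull64 ? crc_t10dif_pmull_p64(crc, data, len)
//				   : crc_t10dif_pmull_p8(crc, data, len);
//		kernel_neon_end();
//		return crc;
//	}
//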

	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
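
//
// The 'x^N mod G(x)' constants above can be regenerated with a short helper
// such as the illustrative C sketch below (not part of this file or of the
// build); it reduces x^N modulo the 17-bit polynomial 0x18bb7 one bit at a
// time.
//
//	/* Return x^n mod G(x) for G(x) = 0x18bb7, as a 16-bit value. */
//	static unsigned int xn_mod_g(unsigned int n)
//	{
//		unsigned int rem = 1;		/* x^0 */
//
//		while (n--) {
//			rem <<= 1;		/* multiply by x */
//			if (rem & 0x10000)	/* degree reached 16: reduce */
//				rem ^= 0x18bb7;
//		}
//		return rem;
//	}
//
//	/* e.g. xn_mod_g(8*128) == 0x6123 and xn_mod_g(8*128 + 64) == 0x2295,
//	 * matching .Lfold_across_128_bytes_consts above. */
//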

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
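//
// For example, with len = 2 the vector at &byteshift_table[14] is
// {0x8e, 0x8f, 0x0, 0x1, ..., 0xd}: the two 0x8x entries have their top bit
// set, so tbl writes 0x00 to the two lowest-order result bytes and moves
// each remaining source byte up by two positions, i.e. a left shift by 2
// bytes.  XORing every index with 0x80 flips which entries are out of range,
// turning the same vector into a right shift by 16 - 2 = 14 bytes.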
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0