//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
//      /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
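//
// Overview: the CPU_LE byte reversals below put the message into polynomial
// coefficient order (CRC-T10DIF is a non-reflected CRC).  For buffers of at
// least 256 bytes, a 128-byte accumulator held in v0-v7 is folded across the
// input using carryless (PMULL) multiplication, then folded down across 64,
// 32 and 16 bytes, and the remaining 128-bit value is reduced to the final
// 16-bit CRC with a Barrett reduction.  The "p64" entry point uses the
// Crypto Extensions 64x64-bit PMULL; the "p8" entry point runs the same
// algorithm on top of the baseline 8x8-bit PMULL.
//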

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	fold_consts_ptr	.req	x3

	fold_consts	.req	v10

	ad		.req	v14

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm

	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

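// Helper for the "p8" fallback: together with the __pmull_p8 macro below,
// this emulates a 64x64 -> 128 bit carryless multiply using only the 8-bit
// PMULL instruction.  __pmull_pre_p8 above prepares bd1..bd4 as copies of
// the fold constants rotated by 1..4 bytes within each 64-bit half; this
// routine multiplies the data against those rotated operands (and rotated
// copies of the data against the original constants), masks and byte-shifts
// the partial products back into place, and returns the combined cross
// terms in t4 and t6, which the caller XORs into the aligned product D.
// The two entry points operate on the low (pmull) and high (pmull2) 64-bit
// halves of the inputs respectively.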
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
SYM_FUNC_END(__pmull_p8_core)

	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, fold_consts
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	__pmull_\p	v8, \reg1, fold_consts, 2
	__pmull_\p	\reg1, \reg1, fold_consts

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, fold_consts, 2
	__pmull_\p	\reg2, \reg2, fold_consts

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm
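
	// A note on the math: folding one 16-byte accumulator register R
	// forward into the 16 bytes D that lie N bytes further along in the
	// message computes
	//	R' = R_hi * (x^(8*N+64) mod G) + R_lo * (x^(8*N) mod G) + D
	// (carryless multiplies and XORs), which is congruent to
	// R * x^(8*N) + D modulo G, so the running remainder is preserved.
	// fold_32_bytes above performs this for two registers with N = 128
	// (the main loop); fold_16_bytes below is the same operation with
	// N = 64, 32 or 16, depending on which constants are currently loaded
	// in fold_consts.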

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	__pmull_\p	v8, \src_reg, fold_consts
	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm

	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

	.macro		crc_t10dif_pmull, p
	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b

	// Load the constants for folding across 128 bytes.
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	__pmull_\p	v8, v7, fold_consts
	__pmull_\p	v7, v7, fold_consts, 2
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.
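	//
	// For example, with len = 3 the last 19 message bytes are re-divided
	// into a first chunk holding the 3 highest-order bytes of v7 and a
	// second 16-byte chunk made of the remaining 13 bytes of v7 followed
	// by the final 3 data bytes; the first chunk is then folded forward 16
	// bytes into the second using the fold-across-16-bytes constants that
	// are already loaded in fold_consts.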

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
	// then '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	__pmull_\p	v0, v3, fold_consts
	__pmull_\p	v7, v3, fold_consts, 2
	eor		v7.16b, v7.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]	// zero high 32 bits
	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b	// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Use Barrett reduction to compute the final CRC value.
	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32	// /= x^32
	__pmull_\p	v1, v1, fold_consts	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	.ifc		\p, p8
	ldp		x29, x30, [sp], #16
	.endif
	ret

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
	.endm

//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp
	crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

	.align		5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
SYM_FUNC_END(crc_t10dif_pmull_p64)
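
//
// For reference, a C caller (e.g. the CRC-T10DIF glue code) would use these
// routines along the lines of the sketch below.  This is illustrative only,
// not the actual glue code: have_pmull64 and crc_t10dif_generic() stand in
// for whatever CPU-feature check and generic fallback the caller uses, and
// kernel-mode NEON must be enabled around the call since these routines
// clobber a wide range of NEON registers.
//
//	u16 crc_t10dif_update(u16 crc, const u8 *data, size_t len)
//	{
//		if (len < 16)			/* asm assumes len >= 16 */
//			return crc_t10dif_generic(crc, data, len);
//
//		kernel_neon_begin();
//		crc = have_pmull64 ? crc_t10dif_pmull_p64(crc, data, len)
//				   : crc_t10dif_pmull_p8(crc, data, len);
//		kernel_neon_end();
//		return crc;
//	}
//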

	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
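
//
// The 'x^N mod G(x)' constants above can be regenerated with a short helper
// such as the illustrative C sketch below (not part of this file or of the
// build); it reduces x^N modulo the 17-bit polynomial 0x18bb7 one bit at a
// time.
//
//	/* Return x^n mod G(x) for G(x) = 0x18bb7, as a 16-bit value. */
//	static unsigned int xn_mod_g(unsigned int n)
//	{
//		unsigned int rem = 1;		/* x^0 */
//
//		while (n--) {
//			rem <<= 1;		/* multiply by x */
//			if (rem & 0x10000)	/* degree reached 16: reduce */
//				rem ^= 0x18bb7;
//		}
//		return rem;
//	}
//
//	/* e.g. xn_mod_g(8*128) == 0x6123 and xn_mod_g(8*128 + 64) == 0x2295,
//	 * matching .Lfold_across_128_bytes_consts above. */
//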

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
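//
// For example, with len = 2 the vector at &byteshift_table[14] is
// {0x8e, 0x8f, 0x0, 0x1, ..., 0xd}: the two 0x8x entries have their top bit
// set, so tbl writes 0x00 to the two lowest-order result bytes and moves
// each remaining source byte up by two positions, i.e. a left shift by 2
// bytes.  XORing every index with 0x80 flips which entries are out of range,
// turning the same vector into a right shift by 16 - 2 = 14 bytes.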
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0