/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
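	/*
	 * Thin wrapper around the 64x64 -> 128 bit carryless multiply
	 * instruction. The b1..b4 operands are accepted but unused: they
	 * only exist so that __pmull_p64 and __pmull_p8 can be invoked
	 * with the same argument list from the ghash_update macro below.
	 */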
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm
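	/*
	 * The reduction macros below fold the 256-bit product held in
	 * XL (low), XM (middle/Karatsuba term) and XH (high) back into a
	 * 128-bit value in XL, modulo the GHASH field polynomial
	 * x^128 + x^7 + x^2 + x + 1. The p64 variant does this with two
	 * multiplications by the constant loaded into MASK by the p64
	 * entry point (0xe1 shifted left by 57, i.e. 0xc200000000000000);
	 * the p8 variant uses an equivalent shift-and-xor sequence whose
	 * shift amounts (57/62/63 and 1/2/7) mirror the x^7, x^2 and x
	 * terms of that polynomial.
	 */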
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
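	//
	// Core update loop, instantiated once for each of the two variants.
	// Per the C prototype further down and the AAPCS:
	//   r0   - number of blocks to process
	//   r1   - pointer to the 16-byte digest (dg[])
	//   r2   - source data
	//   r3   - key structure (already consumed by the ENTRY code)
	//   [sp] - optional pointer to a single 'head' block, done first
	//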
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
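	// The four-way path above (p64 only) folds four blocks into the
	// accumulator per iteration, multiplying them by the four values
	// loaded from the key structure (HH4, HH3, HH and SHASH, i.e. the
	// precomputed powers of H) so that one reduction covers all four
	// products. The path below handles a single block at a time: it is
	// the only path for the p8 variant, and is also taken by the p64
	// variant until the remaining block count is a multiple of 4.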
2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)