xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/ghash-clmulni-intel_asm.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-only */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
4*4882a593Smuzhiyun * instructions. This file contains accelerated part of ghash
5*4882a593Smuzhiyun * implementation. More information about PCLMULQDQ can be found at:
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
8*4882a593Smuzhiyun *
9*4882a593Smuzhiyun * Copyright (c) 2009 Intel Corp.
10*4882a593Smuzhiyun *   Author: Huang Ying <ying.huang@intel.com>
11*4882a593Smuzhiyun *	     Vinodh Gopal
12*4882a593Smuzhiyun *	     Erdinc Ozturk
13*4882a593Smuzhiyun *	     Deniz Karakoyunlu
14*4882a593Smuzhiyun */
15*4882a593Smuzhiyun
16*4882a593Smuzhiyun#include <linux/linkage.h>
17*4882a593Smuzhiyun#include <asm/frame.h>
18*4882a593Smuzhiyun
19*4882a593Smuzhiyun.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
20*4882a593Smuzhiyun.align 16
21*4882a593Smuzhiyun.Lbswap_mask:
22*4882a593Smuzhiyun	.octa 0x000102030405060708090a0b0c0d0e0f
23*4882a593Smuzhiyun
24*4882a593Smuzhiyun#define DATA	%xmm0
25*4882a593Smuzhiyun#define SHASH	%xmm1
26*4882a593Smuzhiyun#define T1	%xmm2
27*4882a593Smuzhiyun#define T2	%xmm3
28*4882a593Smuzhiyun#define T3	%xmm4
29*4882a593Smuzhiyun#define BSWAP	%xmm5
30*4882a593Smuzhiyun#define IN1	%xmm6
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun.text
33*4882a593Smuzhiyun
/*
 * __clmul_gf128mul_ble - 128x128-bit carry-less multiply plus reduction.
 *
 * Internal, register-based ABI (not callable from C):
 *   in:       DATA  = operand1
 *             SHASH = operand2, hash_key << 1 mod poly (only read here)
 *   out:      DATA  = operand1 * operand2 mod poly
 *   changed:  T1, T2, T3
 *
 * The 256-bit product is built Karatsuba style from three 64x64
 * PCLMULQDQ products and then folded back to 128 bits in two phases.
 */
SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
	/* Prepare the operands of the middle product. */
	pshufd $0x4e, DATA, T2		# T2 = DATA with its qwords swapped
	pxor DATA, T2			# each qword of T2 = a1 + a0
	pshufd $0x4e, SHASH, T3		# T3 = SHASH with its qwords swapped
	pxor SHASH, T3			# each qword of T3 = b1 + b0
	movaps DATA, T1			# copy of operand1 for the high product

	/* Three 64x64 carry-less multiplications. */
	pclmulqdq $0x11, SHASH, T1	# T1 = a1 * b1
	pclmulqdq $0x00, SHASH, DATA	# DATA = a0 * b0
	pclmulqdq $0x00, T3, T2		# T2 = (a1 + a0) * (b1 + b0)
	pxor T1, T2
	pxor DATA, T2			# T2 = a0 * b1 + a1 * b0

	/* Add the middle term, shifted by 64 bits, into <T1:DATA>. */
	movaps T2, T3
	psrldq $8, T2			# upper qword of middle term, moved down
	pslldq $8, T3			# lower qword of middle term, moved up
	pxor T2, T1
	pxor T3, DATA			# <T1:DATA> = full 256-bit product

	/*
	 * Reduction, phase one: per qword,
	 * T3 = (DATA << 63) + (DATA << 62) + (DATA << 57).
	 */
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	psrldq $8, T3			# upper qword goes into T1
	pslldq $8, T2			# lower qword goes into upper half of DATA
	pxor T3, T1
	pxor T2, DATA

	/*
	 * Reduction, phase two: per qword,
	 * T2 = (DATA >> 1) + (DATA >> 2) + (DATA >> 7).
	 */
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1			# fold the shifted terms into the high half
	pxor T1, DATA			# DATA = reduced 128-bit result
	RET
SYM_FUNC_END(__clmul_gf128mul_ble)
90*4882a593Smuzhiyun
/*
 * void clmul_ghash_mul(char *dst, const u128 *shash)
 *
 * Multiply the 16-byte block at dst by the value at shash with
 * __clmul_gf128mul_ble and write the product back to dst.
 *
 *   in:       %rdi = dst   (16 bytes, read and written, may be unaligned)
 *             %rsi = shash (16 bytes, read only, may be unaligned)
 *   changed:  DATA, SHASH, T1, T2, T3, BSWAP
 *
 * The block is byte-reversed with .Lbswap_mask before the multiply and
 * reversed again before it is stored.
 */
SYM_FUNC_START(clmul_ghash_mul)
	FRAME_BEGIN
	movups (%rdi), DATA
	movups (%rsi), SHASH
	/* RIP-relative: no absolute relocation, shorter encoding. */
	movaps .Lbswap_mask(%rip), BSWAP
	pshufb BSWAP, DATA		# reverse the byte order of the block
	call __clmul_gf128mul_ble	# DATA = DATA * SHASH mod poly
	pshufb BSWAP, DATA		# back to the in-memory byte order
	movups DATA, (%rdi)
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_mul)
104*4882a593Smuzhiyun
/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const u128 *shash);
 *
 * For every complete 16-byte block of src, XOR the block into the running
 * value held at dst and multiply the result by the value at shash.
 *
 *   in:       %rdi = dst    (16 bytes, read and written, may be unaligned)
 *             %rsi = src    (may be unaligned)
 *             %edx = srclen (bytes; only the low 32 bits are used, because
 *                            the argument is a 32-bit unsigned int)
 *             %rcx = shash  (16 bytes, read only, may be unaligned)
 *   changed:  %rdx, %rsi, flags, DATA, SHASH, T1, T2, T3, BSWAP, IN1
 *
 * If srclen is below 16 nothing is read or written. A trailing partial
 * block (srclen % 16 bytes) is left unprocessed.
 */
SYM_FUNC_START(clmul_ghash_update)
	FRAME_BEGIN
	cmp $16, %edx
	jb .Lupdate_just_ret	# less than one full block: nothing to do
	/* RIP-relative: no absolute relocation, shorter encoding. */
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	pshufb BSWAP, DATA		# reverse the byte order of the running value
.align 4
.Lupdate_loop:
	movups (%rsi), IN1
	pshufb BSWAP, IN1		# reverse the byte order of the input block
	pxor IN1, DATA
	call __clmul_gf128mul_ble	# DATA = (DATA + block) * SHASH mod poly
	sub $16, %edx			# one block consumed
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop		# unsigned: another full block remains
	pshufb BSWAP, DATA		# back to the in-memory byte order
	movups DATA, (%rdi)
.Lupdate_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_update)
133