/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains accelerated part of ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	    Vinodh Gopal
 *	    Erdinc Ozturk
 *	    Deniz Karakoyunlu
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * PSHUFB control mask that reverses the 16 bytes of an XMM register:
 * stored little-endian, mask byte i holds the value 15 - i, so result
 * byte i is taken from source byte 15 - i.  The 16-byte alignment is
 * required because the mask is loaded with movaps.
 */
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f

/* Register roles shared by every routine in this file. */
#define DATA	%xmm0
#define SHASH	%xmm1
#define T1	%xmm2
#define T2	%xmm3
#define T3	%xmm4
#define BSWAP	%xmm5
#define IN1	%xmm6

.text

/*
 * __clmul_gf128mul_ble:	internal ABI
 * input:
 *	DATA:			operand1
 *	SHASH:			operand2, hash_key << 1 mod poly
 * output:
 *	DATA:			operand1 * operand2 mod poly
 * changed:
 *	T1
 *	T2
 *	T3
 *
 * The 128x128-bit carry-less product is built from three PCLMULQDQ
 * operations (a0*b0, a1*b1 and (a1+a0)*(b1+b0)) and then folded back to
 * 128 bits in two shift/xor phases.  SHASH is only read, never written.
 */
SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
	movaps DATA, T1
	/* 0b01001110 swaps the two 64-bit halves of the source register. */
	pshufd $0b01001110, DATA, T2
	pshufd $0b01001110, SHASH, T3
	pxor DATA, T2			/* T2 = a1 + a0 (in both halves) */
	pxor SHASH, T3			/* T3 = b1 + b0 (in both halves) */

	pclmulqdq $0x00, SHASH, DATA	# DATA = a0 * b0
	pclmulqdq $0x11, SHASH, T1	# T1 = a1 * b1
	pclmulqdq $0x00, T3, T2		# T2 = (a1 + a0) * (b1 + b0)
	pxor DATA, T2
	pxor T1, T2			# T2 = a0 * b1 + a1 * b0

	/* Add the middle term, split across the low and high 128 bits. */
	movaps T2, T3
	pslldq $8, T3
	psrldq $8, T2
	pxor T3, DATA
	pxor T2, T1			# <T1:DATA> is result of
					# carry-less multiplication

	# first phase of the reduction
	/*
	 * Per 64-bit lane the shift/xor chain below yields
	 * T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57).
	 */
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2
	psrldq $8, T3
	pxor T2, DATA
	pxor T3, T1

	# second phase of the reduction
	/*
	 * Per 64-bit lane the shift/xor chain below yields
	 * T2 = (DATA >> 7) ^ (DATA >> 2) ^ (DATA >> 1).
	 */
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA			/* fold the high half into the result */
	RET
SYM_FUNC_END(__clmul_gf128mul_ble)

/*
 * void clmul_ghash_mul(char *dst, const u128 *shash)
 *
 * Multiplies the 16-byte value at dst by the key at shash and writes the
 * product back to dst.  The value is byte-reversed before the multiply
 * and again before the store.  Neither pointer needs to be 16-byte
 * aligned (unaligned loads and stores are used).
 * Uses DATA, SHASH, T1-T3 and BSWAP.
 */
SYM_FUNC_START(clmul_ghash_mul)
	FRAME_BEGIN
	movups (%rdi), DATA
	movups (%rsi), SHASH
	/* RIP-relative load: no absolute relocation is needed for the mask. */
	movaps .Lbswap_mask(%rip), BSWAP
	pshufb BSWAP, DATA
	call __clmul_gf128mul_ble
	pshufb BSWAP, DATA
	movups DATA, (%rdi)
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_mul)

/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const u128 *shash);
 *
 * For every complete 16-byte block of src: xor the byte-reversed block
 * into the running value taken from dst, then multiply by the key at
 * shash.  The result is byte-reversed again and stored to dst once all
 * complete blocks are consumed.  A trailing partial block (srclen % 16
 * bytes) is not processed here, and dst is left untouched when
 * srclen < 16.
 * Uses DATA, SHASH, T1-T3, BSWAP and IN1; modifies %rsi and %edx.
 */
SYM_FUNC_START(clmul_ghash_update)
	FRAME_BEGIN
	/*
	 * srclen is a 32-bit unsigned argument, so only %edx is defined by
	 * the calling convention; the upper half of %rdx may hold stale
	 * bits.  All length arithmetic is therefore done on %edx and uses
	 * unsigned branches.
	 */
	cmp $16, %edx
	jb .Lupdate_just_ret	# check length
	/* RIP-relative load: no absolute relocation is needed for the mask. */
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	pshufb BSWAP, DATA
.align 4
.Lupdate_loop:
	movups (%rsi), IN1
	pshufb BSWAP, IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble
	sub $16, %edx
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop	/* unsigned, matching the entry check */
	pshufb BSWAP, DATA
	movups DATA, (%rdi)
.Lupdate_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_update)