/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a
 * circular buffer on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16

/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp
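	/*
	 * Reserve a 16-byte aligned, 64-byte workspace on the stack for the
	 * 16-entry w[i]+K circular buffer addressed via the WK() macro; the
	 * alignment is needed by the movdqa stores in the SSSE3 path.
	 */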
	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	.set i, 0
	.rept W_PRECALC_AHEAD
	W_PRECALC i
	.set i, (i+1)
	.endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
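	# The W_PRECALC calls issued from the F4 rounds below already fetch
	# message words for the *next* block; on the last block BUFFER_PTR
	# therefore points at the constant table, which is safely readable,
	# and the values pre-calculated from it are simply never consumed.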
	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set T1, REG_T1
	.set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
	.set _T, \a
	.set \a, \b
	.set \b, _T
.endm

.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4  b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d
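	# second round of the pair: t1 = F(a, b, c); the first round's
	# e += (a rol 5) is completed below before \a is rotated onwards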
	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7 => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC  r
	.set i, \r

	.if (i < 20)
		.set K_XMM, 0
	.elseif (i < 40)
		.set K_XMM, 16
	.elseif (i < 60)
		.set K_XMM, 32
	.elseif (i < 80)
		.set K_XMM, 48
	.endif

	.if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
		.set i, ((\r) % 80)	# pre-compute for the next iteration
		.if (i == 0)
			W_PRECALC_RESET
		.endif
		W_PRECALC_00_15
	.elseif (i < 32)
		W_PRECALC_16_31
	.elseif (i < 80)	// rounds 32-79
		W_PRECALC_32_79
	.endif
.endm

.macro W_PRECALC_RESET
	.set W,          W0
	.set W_minus_04, W4
	.set W_minus_08, W8
	.set W_minus_12, W12
	.set W_minus_16, W16
	.set W_minus_20, W20
	.set W_minus_24, W24
	.set W_minus_28, W28
	.set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
	.set W_minus_32, W_minus_28
	.set W_minus_28, W_minus_24
	.set W_minus_24, W_minus_20
	.set W_minus_20, W_minus_16
	.set W_minus_16, W_minus_12
	.set W_minus_12, W_minus_08
	.set W_minus_08, W_minus_04
	.set W_minus_04, W
	.set W, W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
	.if ((i & 3) == 0)
		movdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		pshufb	XMM_SHUFB_BSWAP, W_TMP1
		movdqa	W_TMP1, W
	.elseif ((i & 3) == 2)
		paddd	(K_BASE), W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm
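/* For reference, one 4-wide pass of the rounds 0-15 pre-compute above is
 * equivalent to this scalar C sketch (illustrative only; 'data' and 'wk'
 * stand for BUFFER_PTR and the WK() stack slots):
 *
 *	for (j = i; j < i + 4; j++) {
 *		w[j] = be32_to_cpu(data[j]);	// movdqu + pshufb byte swap
 *		wk[j & 15] = w[j] + K1;		// paddd (K_BASE)
 *	}
 */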
/* message scheduling pre-compute for rounds 16-31
 *
 * - calculates the last 32 w[i] values in 8 XMM registers
 * - pre-calculates the K+w[i] values and stores them to memory, for a later
 *   load by the scalar ALU add instruction
 *
 * vectorizing rounds 16-31 takes some "heavy lifting" due to the
 * w[i]->w[i-3] dependency, but it pays off for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
	# blended scheduling of vector and scalar instruction streams, one 4-wide
	# vector iteration / 4 scalar rounds
	.if ((i & 3) == 0)
		movdqa	W_minus_12, W
		palignr	$8, W_minus_16, W	# w[i-14]
		movdqa	W_minus_04, W_TMP1
		psrldq	$4, W_TMP1		# w[i-3]
		pxor	W_minus_08, W
	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W_TMP1
		pxor	W_TMP1, W
		movdqa	W, W_TMP2
		movdqa	W, W_TMP1
		pslldq	$12, W_TMP2
	.elseif ((i & 3) == 2)
		psrld	$31, W
		pslld	$1, W_TMP1
		por	W, W_TMP1
		movdqa	W_TMP2, W
		psrld	$30, W_TMP2
		pslld	$2, W
	.elseif ((i & 3) == 3)
		pxor	W, W_TMP1
		pxor	W_TMP2, W_TMP1
		movdqa	W_TMP1, W
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification:
 *	w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * we instead compute the equivalent:
 *	w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i]=>w[i-3]
 * dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
	.if ((i & 3) == 0)
		movdqa	W_minus_04, W_TMP1
		pxor	W_minus_28, W		# W is W_minus_32 before xor
		palignr	$8, W_minus_08, W_TMP1
	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W
		pxor	W_TMP1, W
		movdqa	W, W_TMP1
	.elseif ((i & 3) == 2)
		psrld	$30, W
		pslld	$2, W_TMP1
		por	W, W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, W
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.endm				// W_PRECALC_SSSE3
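/* SHA-1 round constants; each one is replicated across all four dword lanes
 * of K_XMM_AR below so that a single paddd adds the round constant to four
 * message words at once */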
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f

.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM	sha1_transform_ssse3

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
	W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
	W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
	W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
	.if ((i & 3) == 0)
		vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
	.elseif ((i & 3) == 2)
		vpaddd	(K_BASE), W, W_TMP1
	.elseif ((i & 3) == 3)
		vmovdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.macro W_PRECALC_16_31_AVX
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
		vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
		vpxor	W_minus_08, W, W
		vpxor	W_minus_16, W_TMP1, W_TMP1
	.elseif ((i & 3) == 1)
		vpxor	W_TMP1, W, W
		vpslldq	$12, W, W_TMP2
		vpslld	$1, W, W_TMP1
	.elseif ((i & 3) == 2)
		vpsrld	$31, W, W
		vpor	W, W_TMP1, W_TMP1
		vpslld	$2, W_TMP2, W
		vpsrld	$30, W_TMP2, W_TMP2
	.elseif ((i & 3) == 3)
		vpxor	W, W_TMP1, W_TMP1
		vpxor	W_TMP2, W_TMP1, W
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm
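/* the three-operand AVX forms name the destination explicitly, which avoids
 * the extra register-to-register movdqa copies the SSSE3 variants above
 * need and shortens the pre-compute dependency chains */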
.macro W_PRECALC_32_79_AVX
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_08, W_minus_04, W_TMP1
		vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
	.elseif ((i & 3) == 1)
		vpxor	W_minus_16, W_TMP1, W_TMP1
		vpxor	W_TMP1, W, W
	.elseif ((i & 3) == 2)
		vpslld	$2, W, W_TMP1
		vpsrld	$30, W, W
		vpor	W, W_TMP1, W
	.elseif ((i & 3) == 3)
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.endm				// W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM	sha1_transform_avx
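
/*
 * Usage sketch (illustrative only; the real glue code lives in a separate C
 * file and may differ): a caller would pick the widest variant the CPU
 * supports once at init and wrap calls in kernel_fpu_begin()/kernel_fpu_end(),
 * since both implementations clobber XMM state, e.g.:
 *
 *	if (boot_cpu_has(X86_FEATURE_AVX))
 *		sha1_transform = sha1_transform_avx;
 *	else
 *		sha1_transform = sha1_transform_ssse3;
 *
 *	kernel_fpu_begin();
 *	sha1_transform(state, data, blocks);
 *	kernel_fpu_end();
 */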