########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
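########################################################################
# For reference, the FIPS 180-4 quantities computed below, written out
# as pseudocode (all values 32-bit, ROR = rotate right, >> = shift):
#
#   S0(a) = (a ROR 2)  ^ (a ROR 13) ^ (a ROR 22)
#   S1(e) = (e ROR 6)  ^ (e ROR 11) ^ (e ROR 25)
#   CH    = (e & f) ^ (~e & g),     computed below as ((f^g)&e)^g
#   MAJ   = (a&b) ^ (a&c) ^ (b&c),  computed below as ((a|c)&b)|(a&c)
#   s0(w) = (w ROR 7)  ^ (w ROR 18) ^ (w >> 3)
#   s1(w) = (w ROR 17) ^ (w ROR 19) ^ (w >> 10)
#
# with the message schedule, which the vector code evaluates four
# dwords at a time:
#
#   W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
########################################################################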

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

# MY_ROR imm, reg
# Rotate reg right by imm bits (implemented with shld)
.macro MY_ROR p1 p2
	shld	$(32-(\p1)), \p2, \p2
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ	\p2, \p1
	vpshufb	\p3, \p1, \p1
.endm
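################################
# A worked expansion of the macros above (a sketch, not emitted code):
# "MY_ROR 7, y0" becomes "shld $25, y0, y0".  With the same register as
# source and destination, shld by (32-n) is a left rotate by (32-n),
# which is exactly a right rotate by n.  Likewise
# "COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK" becomes:
#
#	vmovdqu	0*16(INP), X0
#	vpshufb	BYTE_FLIP_MASK, X0, X0
#
# an unaligned 16-byte load followed by a per-dword byte swap.
################################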

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER  = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx			# 3rd arg
INP = %rsi			# 2nd arg
CTX = %rdi			# 1st arg

SRND = %rsi			# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP      = _INP_END + _INP_END_SIZE
_XFER     = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
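# The resulting frame, relative to the 16-byte-aligned %rsp (the
# offsets follow directly from the definitions above):
#
#	[%rsp +  0 ..  7]	_INP_END	pointer to end of input
#	[%rsp +  8 .. 15]	_INP		saved input pointer
#	[%rsp + 16 .. 31]	_XFER		16-byte K[t]+W[t] transfer slot
#
# _XMM_SAVE is empty (size 0), so STACK_SIZE comes to 32 bytes.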

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
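# Both rotations above happen purely at assembly time; no instructions
# are emitted.  After one ROTATE_ARGS, for example, the symbol "a" names
# the register that just received the fresh working value (the old "h"),
# so the identical round body can simply be repeated.  The a..h mapping
# has period 8, so two four-round groups restore the original names.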

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << (32-7)
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << (32-18)
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP1, XTMP3, XTMP3	#
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm
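# Each FOUR_ROUNDS_AND_SCHED retires four scalar rounds while the
# interleaved vector instructions derive the next four schedule words.
# The vector data flow, condensed into pseudocode (a summary of the
# comments above, not additional code):
#
#	XTMP0 = W[-7] + W[-16]		# vpalignr / vpaddd
#	XTMP1 = s0(W[-15])		# vpsrld / vpslld / vpor / vpxor
#	XTMP0 = XTMP0 + XTMP1
#	s1 uses 64-bit shifts (vpsrlq), so it is formed in two halves,
#	{xBxA} then {xDxC}, compacted with SHUF_00BA and SHUF_DC00
#	X0    = XTMP0 + s1		# {W[3], W[2], W[1], W[0]}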

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
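# DO_ROUND is the same scalar round as in FOUR_ROUNDS_AND_SCHED with
# the message-schedule work stripped out; it serves the final 16
# rounds, whose W values are already complete in X0..X3.  For example,
# DO_ROUND 3 picks up its K[t]+W[t] word from (3*4 + _XFER)(%rsp), the
# last dword of the transfer slot filled by the preceding
# vpaddd/vmovdqa pair.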

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_FUNC_START(sha256_transform_avx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl	$6, NUM_BLKS		# convert to bytes
	jz	done_hash
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
loop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
loop1:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	loop1

	mov	$2, SRND
loop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	loop2

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	mov	_INP(%rsp), INP
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	loop0

done_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_avx)
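########################################################################
# A minimal caller sketch (hypothetical glue code; names other than
# sha256_transform_avx are illustrative).  Because this routine
# clobbers XMM state, a kernel caller must bracket the call with
# kernel_fpu_begin()/kernel_fpu_end():
#
#	kernel_fpu_begin();
#	sha256_transform_avx(state, data, blocks); /* blocks of 64 bytes */
#	kernel_fpu_end();
########################################################################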

.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
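# Note on the two shuffle masks: in vpshufb, any index byte with its
# top bit set writes 0x00 to that destination byte.  The 0xFF bytes
# above therefore zero the unused lanes: _SHUF_00BA keeps dwords A and
# B in the low half and clears the high half, while _SHUF_DC00 moves
# dwords C and D to the high half and clears the low half.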