########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define	MOVDQ movdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ	\p2, \p1
	pshufb	\p3, \p1
.endm

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER  = %xmm9

SHUF_00BA = %xmm10	# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11	# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx		# 3rd arg
INP = %rsi		# 2nd arg
CTX = %rdi		# 1st arg

SRND = %rsi		# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END	= 0
_INP		= _INP_END  + _INP_END_SIZE
_XFER		= _INP      + _INP_SIZE
_XMM_SAVE	= _XFER     + _XFER_SIZE
STACK_SIZE	= _XMM_SAVE + _XMM_SAVE_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
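
################################

# For reference: the scalar math that FOUR_ROUNDS_AND_SCHED and DO_ROUND
# below interleave with the vector message schedule -- a minimal C sketch
# in FIPS 180-4 terms (rotr32() is a hypothetical helper, not part of
# this file):
#
#	static inline u32 rotr32(u32 x, int n)
#	{
#		return (x >> n) | (x << (32 - n));
#	}
#
#	/* one round; a..h are the working variables, k the round
#	 * constant, w the scheduled message word */
#	u32 S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
#	u32 ch  = ((f ^ g) & e) ^ g;		/* == (e & f) ^ (~e & g) */
#	u32 t1  = h + S1 + ch + k + w;
#	u32 S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
#	u32 maj = ((a | c) & b) | (a & c);	/* == (a&b)^(a&c)^(b&c) */
#	d += t1;
#	h  = t1 + S0 + maj;
#
#	/* message schedule, W[-n] counted back from the word being built:
#	 * W[0] = W[-16] + sigma0(W[-15]) + W[-7] + sigma1(W[-2]) */
#	u32 s0 = rotr32(w15, 7)  ^ rotr32(w15, 18) ^ (w15 >> 3);
#	u32 s1 = rotr32(w2, 17)  ^ rotr32(w2, 19)  ^ (w2 >> 10);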
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa	X3, XTMP0
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	palignr	$4, X2, XTMP0		# XTMP0 = W[-7]
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa	X1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	paddd	X0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr	$4, X0, XTMP1		# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	movdqa	XTMP1, XTMP2		# XTMP2 = W[-15]
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	movdqa	XTMP1, XTMP3		# XTMP3 = W[-15]
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pslld	$(32-7), XTMP1
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	psrld	$7, XTMP2
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	por	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP3, XTMP2		# XTMP2 = W[-15]
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	movdqa	XTMP3, XTMP4		# XTMP4 = W[-15]
	ror	$(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(22-13), y1		# y1 = a >> (22-13)
	pslld	$(32-18), XTMP3
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	psrld	$18, XTMP2
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	psrld	$3, XTMP4		# XTMP4 = W[-15] >> 3
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pxor	XTMP4, XTMP1		# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	pshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	paddd	XTMP1, XTMP0		# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {BBAA}
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	ror	$(25-11), y0		# y0 = e >> (25-11)
	movdqa	XTMP2, XTMP4		# XTMP4 = W[-2] {BBAA}
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xBxA}
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	psrld	$10, XTMP4		# XTMP4 = W[-2] >> 10 {BBAA}
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP2
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, XTMP4		# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_00BA, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP4, XTMP0		# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {DDCC}
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	movdqa	XTMP2, X0		# X0 = W[-2] {DDCC}
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld	$10, X0			# X0 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	pxor	XTMP3, XTMP2
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, X0		# X0 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_DC00, X0		# X0 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP0, X0		# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
## arg 1 : pointer to state
##	   (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_FUNC_START(sha256_transform_ssse3)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$~15, %rsp		# align stack to 16 bytes

	shl	$6, NUM_BLKS		# convert to bytes
	jz	done_hash
	add	INP, NUM_BLKS
	mov	NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	movdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa	_SHUF_00BA(%rip), SHUF_00BA
	movdqa	_SHUF_DC00(%rip), SHUF_DC00

loop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
loop1:
	movdqa	(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	1*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	2*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	3*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	loop1
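
	## The three passes above cover rounds 0-47, scheduling four
	## future message words per four rounds; X0..X3 now hold
	## W[48..63], so the final 16 rounds below consume them with no
	## further scheduling.  The same control flow as a C sketch
	## (four_rounds_and_sched()/four_rounds() are hypothetical names
	## for the two macros):
	##
	##	for (i = 0; i < 48; i += 4)	/* loop1: rounds + schedule */
	##		four_rounds_and_sched(st, W, &K256[i]);
	##	for (; i < 64; i += 4)		/* loop2: rounds only */
	##		four_rounds(st, W, &K256[i]);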
	mov	$2, SRND
loop2:
	paddd	(TBL), X0
	movdqa	X0, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3
	paddd	1*16(TBL), X1
	movdqa	X1, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	movdqa	X2, X0
	movdqa	X3, X1

	sub	$1, SRND
	jne	loop2

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	mov	_INP(%rsp), INP
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	loop0

done_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203
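
# pshufb with the mask above reverses the four bytes within each dword
# (dest byte i = src byte mask[i]; 03,02,01,00 in the low lane), turning
# the big-endian message words into host order.  Per-dword C equivalent,
# as a sketch (__builtin_bswap32 standing in for the lane swap):
#
#	u32 w;
#	memcpy(&w, in + 4 * i, sizeof(w));	/* unaligned load, cf. MOVDQ */
#	w = __builtin_bswap32(w);		/* cf. pshufb BYTE_FLIP_MASK */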
.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
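
# Usage sketch (hypothetical caller): the routine hashes whole 64-byte
# blocks into state->state[0..7].  Since it uses XMM registers, kernel
# callers are expected to hold the FPU across the call, e.g.:
#
#	kernel_fpu_begin();
#	sha256_transform_ssse3(state, data, nblocks);
#	kernel_fpu_end();
#
# Padding and length encoding of the final partial block are the
# caller's responsibility, as with any raw SHA-256 block function.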