########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
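#
# For reference (added note, not part of the original Intel header), the
# FIPS 180-4 functions that the S0/S1/CH/MAJ/s0/s1 comment tags below
# stand for:
#
#   Sigma0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)	# "S0"
#   Sigma1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)	# "S1"
#   sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)	# "s0"
#   sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)	# "s1"
#   CH(e,f,g) = ((f ^ g) & e) ^ g
#   MAJ(a,b,c) = ((a | c) & b) | (a & c)
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# The rorx results are rotations, even though the inline comments write
# them with the >> shift notation.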
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10              # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12              # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13       # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx                 # 3rd arg
INP      = %rsi                 # 2nd arg
CTX      = %rdi                 # 1st arg
c        = %ecx
d        = %r8d
e        = %edx                 # clobbers NUM_BLKS
y3       = %esi                 # clobbers INP

SRND     = CTX                  # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE      = 2*64*4        # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE  = 0
_INP_END_SIZE   = 8
_INP_SIZE       = 8
_CTX_SIZE       = 8
_RSP_SIZE       = 8

_XFER           = 0
_XMM_SAVE       = _XFER     + _XFER_SIZE
_INP_END        = _XMM_SAVE + _XMM_SAVE_SIZE
_INP            = _INP_END  + _INP_END_SIZE
_CTX            = _INP      + _INP_SIZE
_RSP            = _CTX      + _CTX_SIZE
STACK_SIZE      = _RSP      + _RSP_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
        X_ = X0
        X0 = X1
        X1 = X2
        X2 = X3
        X3 = X_
.endm

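# Note (informational): rotate_Xs above and ROTATE_ARGS below only rebind
# assembler symbols; no data moves at run time.  Four applications of
# rotate_Xs, or eight of ROTATE_ARGS, restore the original register names.
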
# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
        old_h = h
        TMP_ = h
        h = g
        g = f
        f = e
        e = d
        d = c
        c = b
        b = a
        a = TMP_
.endm

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B

        addl    \disp(%rsp, SRND), h    # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA
        vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH
        vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1

        and     e, y2           # y2 = (f^g)&e                          # CH
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     h, d            # d = k + w + h + d                     # --

        and     b, y3           # y3 = (a|c)&b                          # MAJA
        vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0

        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        vpsrld  $7, XTMP1, XTMP2
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB

        add     y0, y2          # y2 = S1 + CH                          # --
        vpslld  $(32-7), XTMP1, XTMP3
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7

        vpsrld  $18, XTMP1, XTMP2
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 1 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 1*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --

        vpslld  $(32-18), XTMP1, XTMP1
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0

        vpxor   XTMP1, XTMP3, XTMP3
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = (W[-15] ror 7) ^ (W[-15] ror 18)
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
        vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}


        ROTATE_ARGS

################################### RND N + 2 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        offset = \disp + 2*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --

        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xBxA}
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        or      c, y3           # y3 = a|c                              # MAJA
        mov     f, y2           # y2 = f                                # CH
        xor     g, y2           # y2 = f^g                              # CH

        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xBxA}
        and     e, y2           # y2 = (f^g)&e                          # CH

        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        vpxor   XTMP3, XTMP2, XTMP2
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --
        vpshufd $0b01010000, XTMP0, XTMP2       # XTMP2 = W[-2] {DDCC}

        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 3 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 3*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xDxC}
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP3, XTMP2, XTMP2
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

        vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

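        # Rebind the X symbols: the four words just computed into X0
        # become X3, and X0 again names the oldest four dwords of the
        # 16-dword schedule window (added note).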
        rotate_Xs
.endm

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        addl    \disp(%rsp, SRND), h    # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 1 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*1 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 2 ##############################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*2 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 3 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*3 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --


        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        mov     %rsp, %rax
        subq    $STACK_SIZE, %rsp
        and     $-32, %rsp      # align rsp to 32 byte boundary
        mov     %rax, _RSP(%rsp)


        shl     $6, NUM_BLKS    # convert to bytes
        jz      done_hash
        lea     -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
        mov     NUM_BLKS, _INP_END(%rsp)

        cmp     NUM_BLKS, INP
        je      only_one_block

        ## load initial digest
        mov     (CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)

loop0:
        ## Load first 16 dwords from two blocks
        VMOVDQ  0*32(INP),XTMP0
        VMOVDQ  1*32(INP),XTMP1
        VMOVDQ  2*32(INP),XTMP2
        VMOVDQ  3*32(INP),XTMP3

        ## byte swap data
        vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
        vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
        vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
        vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
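
        ## Lane layout after the transpose below (added note): each Y
        ## register keeps block 1's dwords in its low 128-bit lane and
        ## block 2's in its high lane, so one schedule pass covers both
        ## blocks; block 2's round inputs are consumed later from the
        ## +16 byte _XFER offsets (see loop3).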

        ## transpose data into high/low halves
        vperm2i128      $0x20, XTMP2, XTMP0, X0
        vperm2i128      $0x31, XTMP2, XTMP0, X1
        vperm2i128      $0x20, XTMP3, XTMP1, X2
        vperm2i128      $0x31, XTMP3, XTMP1, X3

last_block_enter:
        add     $64, INP
        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, by doing 3 rounds of 12 each
        xor     SRND, SRND

.align 16
loop1:
        vpaddd  K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 0*32

        vpaddd  K256+1*32(SRND), X0, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 1*32

        vpaddd  K256+2*32(SRND), X0, XFER
        vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 2*32

        vpaddd  K256+3*32(SRND), X0, XFER
        vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 3*32

        add     $4*32, SRND
        cmp     $3*4*32, SRND
        jb      loop1

loop2:
        ## Do last 16 rounds with no scheduling
        vpaddd  K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 0*32

        vpaddd  K256+1*32(SRND), X1, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 1*32
        add     $2*32, SRND

        vmovdqa X2, X0
        vmovdqa X3, X1

        cmp     $4*4*32, SRND
        jb      loop2

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        ja      done_hash

        #### Do second block using previously scheduled results
        xor     SRND, SRND
.align 16
loop3:
        DO_4ROUNDS       _XFER + 0*32 + 16
        DO_4ROUNDS       _XFER + 1*32 + 16
        add     $2*32, SRND
        cmp     $4*4*32, SRND
        jb      loop3

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP
        add     $64, INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        jb      loop0
        ja      done_hash

do_last_block:
        VMOVDQ  0*16(INP),XWORD0
        VMOVDQ  1*16(INP),XWORD1
        VMOVDQ  2*16(INP),XWORD2
        VMOVDQ  3*16(INP),XWORD3

        vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
        vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
        vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
        vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

        jmp     last_block_enter

only_one_block:

        ## load initial digest
        mov     (4*0)(CTX),a
        mov     (4*1)(CTX),b
        mov     (4*2)(CTX),c
        mov     (4*3)(CTX),d
        mov     (4*4)(CTX),e
        mov     (4*5)(CTX),f
        mov     (4*6)(CTX),g
        mov     (4*7)(CTX),h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)
        jmp     do_last_block

done_hash:

        mov     _RSP(%rsp), %rsp

        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        RET
SYM_FUNC_END(sha256_transform_rorx)

.section        .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section        .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section        .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section        .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
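
# Layout note (informational): each K256 row of four constants is stored
# twice so that a single 256-bit vpaddd applies the same round constants
# to both 128-bit lanes, i.e. to both message blocks at once.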