#! /usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#		SHA256-hw	SHA256(*)	SHA512
# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
# Denver	2.01		10.5 (+26%)	6.70 (+8%)
# X-Gene			20.0 (+100%)	12.8 (+300%(***))
# Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
#
# (*)	Software SHA256 results are of lesser relevance, presented
#	mostly for informational purposes.
# (**)	The result is a trade-off: it's possible to improve it by
#	10% (or by 1 cycle per round), but at the cost of 20% loss
#	on Cortex-A53 (or by 4 cycles per round).
# (***)	Super-impressive coefficients over gcc-generated code are
#	indication of some compiler "pathology", most notably code
#	generated with -mgeneral-regs-only is significantly faster
#	and the gap is only 40-90%.
#
# October 2016.
#
# Originally it was reckoned that it makes no sense to implement NEON
# version of SHA256 for 64-bit processors. This is because performance
# improvement on most wide-spread Cortex-A5x processors was observed
# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
# observed that 32-bit NEON SHA256 performs significantly better than
# 64-bit scalar version on *some* of the more recent processors. As
# result 64-bit NEON version of SHA256 was added to provide best
# all-round performance. For example it executes ~30% faster on X-Gene
# and Mongoose. [For reference, NEON version of SHA512 is bound to
# deliver much less improvement, likely *negative* on Cortex-A5x.
# Which is why NEON support is limited to SHA256.]

$output=pop;	# last command-line argument: output file name
$flavour=pop;	# the one before it: perlasm flavour, or "void"
# NOTE: keep explanatory comments below the first statement. The leading
# run of '#' lines above is copied verbatim into the generated assembly
# by the "open SELF" loop at the end of this script.

# With a real flavour the generated text is piped through arm-xlate.pl
# (looked up next to this script, then in ../../perlasm/); with "void"
# or no flavour it is written straight to $output.
if ($flavour && $flavour ne "void") {
    # Directory part of $0 including the trailing separator; empty when
    # the script was invoked without a path component.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
    *STDOUT=*OUT;
} else {
    # Two-argument form kept on purpose so that existing invocations
    # (including special names such as "-") behave exactly as before.
    open STDOUT,">$output" or die "can't open $output: $!";
}

# The output file name selects the algorithm: anything containing "512"
# generates SHA-512, everything else SHA-256.
#   $SZ      word size in bytes
#   @Sigma0/@Sigma1/@sigma0/@sigma1  rotate/shift amounts of the round
#            and message-schedule functions
#   $rounds  number of rounds
#   $reg_t   general-purpose register prefix ("x" 64-bit, "w" 32-bit)
if ($output =~ /512/) {
	$BITS=512;
	$SZ=8;
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
	$reg_t="x";
} else {
	$BITS=256;
	$SZ=4;
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
	$reg_t="w";
}

$func="sha${BITS}_block_data_order";

# Arguments arrive in x0..x2 (context, input pointer, block count);
# x30 is used as the pointer into the K constant table.
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));

102*4882a593Smuzhiyun@X=map("$reg_t$_",(3..15,0..2)); 103*4882a593Smuzhiyun@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); 104*4882a593Smuzhiyun($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); 105*4882a593Smuzhiyun 106*4882a593Smuzhiyunsub BODY_00_xx { 107*4882a593Smuzhiyunmy ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 108*4882a593Smuzhiyunmy $j=($i+1)&15; 109*4882a593Smuzhiyunmy ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); 110*4882a593Smuzhiyun $T0=@X[$i+3] if ($i<11); 111*4882a593Smuzhiyun 112*4882a593Smuzhiyun$code.=<<___ if ($i<16); 113*4882a593Smuzhiyun#ifndef __AARCH64EB__ 114*4882a593Smuzhiyun rev @X[$i],@X[$i] // $i 115*4882a593Smuzhiyun#endif 116*4882a593Smuzhiyun___ 117*4882a593Smuzhiyun$code.=<<___ if ($i<13 && ($i&1)); 118*4882a593Smuzhiyun ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ 119*4882a593Smuzhiyun___ 120*4882a593Smuzhiyun$code.=<<___ if ($i==13); 121*4882a593Smuzhiyun ldp @X[14],@X[15],[$inp] 122*4882a593Smuzhiyun___ 123*4882a593Smuzhiyun$code.=<<___ if ($i>=14); 124*4882a593Smuzhiyun ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] 125*4882a593Smuzhiyun___ 126*4882a593Smuzhiyun$code.=<<___ if ($i>0 && $i<16); 127*4882a593Smuzhiyun add $a,$a,$t1 // h+=Sigma0(a) 128*4882a593Smuzhiyun___ 129*4882a593Smuzhiyun$code.=<<___ if ($i>=11); 130*4882a593Smuzhiyun str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] 131*4882a593Smuzhiyun___ 132*4882a593Smuzhiyun# While ARMv8 specifies merged rotate-n-logical operation such as 133*4882a593Smuzhiyun# 'eor x,y,z,ror#n', it was found to negatively affect performance 134*4882a593Smuzhiyun# on Apple A7. The reason seems to be that it requires even 'y' to 135*4882a593Smuzhiyun# be available earlier. This means that such merged instruction is 136*4882a593Smuzhiyun# not necessarily best choice on critical path... On the other hand 137*4882a593Smuzhiyun# Cortex-A5x handles merged instructions much better than disjoint 138*4882a593Smuzhiyun# rotate and logical... See (**) footnote above. 
139*4882a593Smuzhiyun$code.=<<___ if ($i<15); 140*4882a593Smuzhiyun ror $t0,$e,#$Sigma1[0] 141*4882a593Smuzhiyun add $h,$h,$t2 // h+=K[i] 142*4882a593Smuzhiyun eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` 143*4882a593Smuzhiyun and $t1,$f,$e 144*4882a593Smuzhiyun bic $t2,$g,$e 145*4882a593Smuzhiyun add $h,$h,@X[$i&15] // h+=X[i] 146*4882a593Smuzhiyun orr $t1,$t1,$t2 // Ch(e,f,g) 147*4882a593Smuzhiyun eor $t2,$a,$b // a^b, b^c in next round 148*4882a593Smuzhiyun eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) 149*4882a593Smuzhiyun ror $T0,$a,#$Sigma0[0] 150*4882a593Smuzhiyun add $h,$h,$t1 // h+=Ch(e,f,g) 151*4882a593Smuzhiyun eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` 152*4882a593Smuzhiyun add $h,$h,$t0 // h+=Sigma1(e) 153*4882a593Smuzhiyun and $t3,$t3,$t2 // (b^c)&=(a^b) 154*4882a593Smuzhiyun add $d,$d,$h // d+=h 155*4882a593Smuzhiyun eor $t3,$t3,$b // Maj(a,b,c) 156*4882a593Smuzhiyun eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) 157*4882a593Smuzhiyun add $h,$h,$t3 // h+=Maj(a,b,c) 158*4882a593Smuzhiyun ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round 159*4882a593Smuzhiyun //add $h,$h,$t1 // h+=Sigma0(a) 160*4882a593Smuzhiyun___ 161*4882a593Smuzhiyun$code.=<<___ if ($i>=15); 162*4882a593Smuzhiyun ror $t0,$e,#$Sigma1[0] 163*4882a593Smuzhiyun add $h,$h,$t2 // h+=K[i] 164*4882a593Smuzhiyun ror $T1,@X[($j+1)&15],#$sigma0[0] 165*4882a593Smuzhiyun and $t1,$f,$e 166*4882a593Smuzhiyun ror $T2,@X[($j+14)&15],#$sigma1[0] 167*4882a593Smuzhiyun bic $t2,$g,$e 168*4882a593Smuzhiyun ror $T0,$a,#$Sigma0[0] 169*4882a593Smuzhiyun add $h,$h,@X[$i&15] // h+=X[i] 170*4882a593Smuzhiyun eor $t0,$t0,$e,ror#$Sigma1[1] 171*4882a593Smuzhiyun eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] 172*4882a593Smuzhiyun orr $t1,$t1,$t2 // Ch(e,f,g) 173*4882a593Smuzhiyun eor $t2,$a,$b // a^b, b^c in next round 174*4882a593Smuzhiyun eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) 175*4882a593Smuzhiyun eor $T0,$T0,$a,ror#$Sigma0[1] 176*4882a593Smuzhiyun add $h,$h,$t1 // h+=Ch(e,f,g) 177*4882a593Smuzhiyun and $t3,$t3,$t2 // 
(b^c)&=(a^b) 178*4882a593Smuzhiyun eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] 179*4882a593Smuzhiyun eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) 180*4882a593Smuzhiyun add $h,$h,$t0 // h+=Sigma1(e) 181*4882a593Smuzhiyun eor $t3,$t3,$b // Maj(a,b,c) 182*4882a593Smuzhiyun eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) 183*4882a593Smuzhiyun eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) 184*4882a593Smuzhiyun add @X[$j],@X[$j],@X[($j+9)&15] 185*4882a593Smuzhiyun add $d,$d,$h // d+=h 186*4882a593Smuzhiyun add $h,$h,$t3 // h+=Maj(a,b,c) 187*4882a593Smuzhiyun ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round 188*4882a593Smuzhiyun add @X[$j],@X[$j],$T1 189*4882a593Smuzhiyun add $h,$h,$t1 // h+=Sigma0(a) 190*4882a593Smuzhiyun add @X[$j],@X[$j],$T2 191*4882a593Smuzhiyun___ 192*4882a593Smuzhiyun ($t2,$t3)=($t3,$t2); 193*4882a593Smuzhiyun} 194*4882a593Smuzhiyun 195*4882a593Smuzhiyun$code.=<<___; 196*4882a593Smuzhiyun#ifndef __KERNEL__ 197*4882a593Smuzhiyun# include "arm_arch.h" 198*4882a593Smuzhiyun#endif 199*4882a593Smuzhiyun 200*4882a593Smuzhiyun.text 201*4882a593Smuzhiyun 202*4882a593Smuzhiyun.extern OPENSSL_armcap_P 203*4882a593Smuzhiyun.globl $func 204*4882a593Smuzhiyun.type $func,%function 205*4882a593Smuzhiyun.align 6 206*4882a593Smuzhiyun$func: 207*4882a593Smuzhiyun___ 208*4882a593Smuzhiyun$code.=<<___ if ($SZ==4); 209*4882a593Smuzhiyun#ifndef __KERNEL__ 210*4882a593Smuzhiyun# ifdef __ILP32__ 211*4882a593Smuzhiyun ldrsw x16,.LOPENSSL_armcap_P 212*4882a593Smuzhiyun# else 213*4882a593Smuzhiyun ldr x16,.LOPENSSL_armcap_P 214*4882a593Smuzhiyun# endif 215*4882a593Smuzhiyun adr x17,.LOPENSSL_armcap_P 216*4882a593Smuzhiyun add x16,x16,x17 217*4882a593Smuzhiyun ldr w16,[x16] 218*4882a593Smuzhiyun tst w16,#ARMV8_SHA256 219*4882a593Smuzhiyun b.ne .Lv8_entry 220*4882a593Smuzhiyun tst w16,#ARMV7_NEON 221*4882a593Smuzhiyun b.ne .Lneon_entry 222*4882a593Smuzhiyun#endif 223*4882a593Smuzhiyun___ 224*4882a593Smuzhiyun$code.=<<___; 225*4882a593Smuzhiyun stp 
x29,x30,[sp,#-128]! 226*4882a593Smuzhiyun add x29,sp,#0 227*4882a593Smuzhiyun 228*4882a593Smuzhiyun stp x19,x20,[sp,#16] 229*4882a593Smuzhiyun stp x21,x22,[sp,#32] 230*4882a593Smuzhiyun stp x23,x24,[sp,#48] 231*4882a593Smuzhiyun stp x25,x26,[sp,#64] 232*4882a593Smuzhiyun stp x27,x28,[sp,#80] 233*4882a593Smuzhiyun sub sp,sp,#4*$SZ 234*4882a593Smuzhiyun 235*4882a593Smuzhiyun ldp $A,$B,[$ctx] // load context 236*4882a593Smuzhiyun ldp $C,$D,[$ctx,#2*$SZ] 237*4882a593Smuzhiyun ldp $E,$F,[$ctx,#4*$SZ] 238*4882a593Smuzhiyun add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input 239*4882a593Smuzhiyun ldp $G,$H,[$ctx,#6*$SZ] 240*4882a593Smuzhiyun adr $Ktbl,.LK$BITS 241*4882a593Smuzhiyun stp $ctx,$num,[x29,#96] 242*4882a593Smuzhiyun 243*4882a593Smuzhiyun.Loop: 244*4882a593Smuzhiyun ldp @X[0],@X[1],[$inp],#2*$SZ 245*4882a593Smuzhiyun ldr $t2,[$Ktbl],#$SZ // *K++ 246*4882a593Smuzhiyun eor $t3,$B,$C // magic seed 247*4882a593Smuzhiyun str $inp,[x29,#112] 248*4882a593Smuzhiyun___ 249*4882a593Smuzhiyunfor ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } 250*4882a593Smuzhiyun$code.=".Loop_16_xx:\n"; 251*4882a593Smuzhiyunfor (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } 252*4882a593Smuzhiyun$code.=<<___; 253*4882a593Smuzhiyun cbnz $t2,.Loop_16_xx 254*4882a593Smuzhiyun 255*4882a593Smuzhiyun ldp $ctx,$num,[x29,#96] 256*4882a593Smuzhiyun ldr $inp,[x29,#112] 257*4882a593Smuzhiyun sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind 258*4882a593Smuzhiyun 259*4882a593Smuzhiyun ldp @X[0],@X[1],[$ctx] 260*4882a593Smuzhiyun ldp @X[2],@X[3],[$ctx,#2*$SZ] 261*4882a593Smuzhiyun add $inp,$inp,#14*$SZ // advance input pointer 262*4882a593Smuzhiyun ldp @X[4],@X[5],[$ctx,#4*$SZ] 263*4882a593Smuzhiyun add $A,$A,@X[0] 264*4882a593Smuzhiyun ldp @X[6],@X[7],[$ctx,#6*$SZ] 265*4882a593Smuzhiyun add $B,$B,@X[1] 266*4882a593Smuzhiyun add $C,$C,@X[2] 267*4882a593Smuzhiyun add $D,$D,@X[3] 268*4882a593Smuzhiyun stp $A,$B,[$ctx] 269*4882a593Smuzhiyun add $E,$E,@X[4] 
270*4882a593Smuzhiyun add $F,$F,@X[5] 271*4882a593Smuzhiyun stp $C,$D,[$ctx,#2*$SZ] 272*4882a593Smuzhiyun add $G,$G,@X[6] 273*4882a593Smuzhiyun add $H,$H,@X[7] 274*4882a593Smuzhiyun cmp $inp,$num 275*4882a593Smuzhiyun stp $E,$F,[$ctx,#4*$SZ] 276*4882a593Smuzhiyun stp $G,$H,[$ctx,#6*$SZ] 277*4882a593Smuzhiyun b.ne .Loop 278*4882a593Smuzhiyun 279*4882a593Smuzhiyun ldp x19,x20,[x29,#16] 280*4882a593Smuzhiyun add sp,sp,#4*$SZ 281*4882a593Smuzhiyun ldp x21,x22,[x29,#32] 282*4882a593Smuzhiyun ldp x23,x24,[x29,#48] 283*4882a593Smuzhiyun ldp x25,x26,[x29,#64] 284*4882a593Smuzhiyun ldp x27,x28,[x29,#80] 285*4882a593Smuzhiyun ldp x29,x30,[sp],#128 286*4882a593Smuzhiyun ret 287*4882a593Smuzhiyun.size $func,.-$func 288*4882a593Smuzhiyun 289*4882a593Smuzhiyun.align 6 290*4882a593Smuzhiyun.type .LK$BITS,%object 291*4882a593Smuzhiyun.LK$BITS: 292*4882a593Smuzhiyun___ 293*4882a593Smuzhiyun$code.=<<___ if ($SZ==8); 294*4882a593Smuzhiyun .quad 0x428a2f98d728ae22,0x7137449123ef65cd 295*4882a593Smuzhiyun .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 296*4882a593Smuzhiyun .quad 0x3956c25bf348b538,0x59f111f1b605d019 297*4882a593Smuzhiyun .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 298*4882a593Smuzhiyun .quad 0xd807aa98a3030242,0x12835b0145706fbe 299*4882a593Smuzhiyun .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 300*4882a593Smuzhiyun .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 301*4882a593Smuzhiyun .quad 0x9bdc06a725c71235,0xc19bf174cf692694 302*4882a593Smuzhiyun .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 303*4882a593Smuzhiyun .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 304*4882a593Smuzhiyun .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 305*4882a593Smuzhiyun .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 306*4882a593Smuzhiyun .quad 0x983e5152ee66dfab,0xa831c66d2db43210 307*4882a593Smuzhiyun .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 308*4882a593Smuzhiyun .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 309*4882a593Smuzhiyun .quad 0x06ca6351e003826f,0x142929670a0e6e70 310*4882a593Smuzhiyun .quad 
0x27b70a8546d22ffc,0x2e1b21385c26c926 311*4882a593Smuzhiyun .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 312*4882a593Smuzhiyun .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 313*4882a593Smuzhiyun .quad 0x81c2c92e47edaee6,0x92722c851482353b 314*4882a593Smuzhiyun .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 315*4882a593Smuzhiyun .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 316*4882a593Smuzhiyun .quad 0xd192e819d6ef5218,0xd69906245565a910 317*4882a593Smuzhiyun .quad 0xf40e35855771202a,0x106aa07032bbd1b8 318*4882a593Smuzhiyun .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 319*4882a593Smuzhiyun .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 320*4882a593Smuzhiyun .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 321*4882a593Smuzhiyun .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 322*4882a593Smuzhiyun .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 323*4882a593Smuzhiyun .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 324*4882a593Smuzhiyun .quad 0x90befffa23631e28,0xa4506cebde82bde9 325*4882a593Smuzhiyun .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 326*4882a593Smuzhiyun .quad 0xca273eceea26619c,0xd186b8c721c0c207 327*4882a593Smuzhiyun .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 328*4882a593Smuzhiyun .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 329*4882a593Smuzhiyun .quad 0x113f9804bef90dae,0x1b710b35131c471b 330*4882a593Smuzhiyun .quad 0x28db77f523047d84,0x32caab7b40c72493 331*4882a593Smuzhiyun .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 332*4882a593Smuzhiyun .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 333*4882a593Smuzhiyun .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 334*4882a593Smuzhiyun .quad 0 // terminator 335*4882a593Smuzhiyun___ 336*4882a593Smuzhiyun$code.=<<___ if ($SZ==4); 337*4882a593Smuzhiyun .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 338*4882a593Smuzhiyun .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 339*4882a593Smuzhiyun .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 340*4882a593Smuzhiyun .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 341*4882a593Smuzhiyun .long 
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 342*4882a593Smuzhiyun .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 343*4882a593Smuzhiyun .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 344*4882a593Smuzhiyun .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 345*4882a593Smuzhiyun .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 346*4882a593Smuzhiyun .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 347*4882a593Smuzhiyun .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 348*4882a593Smuzhiyun .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 349*4882a593Smuzhiyun .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 350*4882a593Smuzhiyun .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 351*4882a593Smuzhiyun .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 352*4882a593Smuzhiyun .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 353*4882a593Smuzhiyun .long 0 //terminator 354*4882a593Smuzhiyun___ 355*4882a593Smuzhiyun$code.=<<___; 356*4882a593Smuzhiyun.size .LK$BITS,.-.LK$BITS 357*4882a593Smuzhiyun#ifndef __KERNEL__ 358*4882a593Smuzhiyun.align 3 359*4882a593Smuzhiyun.LOPENSSL_armcap_P: 360*4882a593Smuzhiyun# ifdef __ILP32__ 361*4882a593Smuzhiyun .long OPENSSL_armcap_P-. 362*4882a593Smuzhiyun# else 363*4882a593Smuzhiyun .quad OPENSSL_armcap_P-. 
364*4882a593Smuzhiyun# endif 365*4882a593Smuzhiyun#endif 366*4882a593Smuzhiyun.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" 367*4882a593Smuzhiyun.align 2 368*4882a593Smuzhiyun___ 369*4882a593Smuzhiyun 370*4882a593Smuzhiyunif ($SZ==4) { 371*4882a593Smuzhiyunmy $Ktbl="x3"; 372*4882a593Smuzhiyun 373*4882a593Smuzhiyunmy ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); 374*4882a593Smuzhiyunmy @MSG=map("v$_.16b",(4..7)); 375*4882a593Smuzhiyunmy ($W0,$W1)=("v16.4s","v17.4s"); 376*4882a593Smuzhiyunmy ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); 377*4882a593Smuzhiyun 378*4882a593Smuzhiyun$code.=<<___; 379*4882a593Smuzhiyun#ifndef __KERNEL__ 380*4882a593Smuzhiyun.type sha256_block_armv8,%function 381*4882a593Smuzhiyun.align 6 382*4882a593Smuzhiyunsha256_block_armv8: 383*4882a593Smuzhiyun.Lv8_entry: 384*4882a593Smuzhiyun stp x29,x30,[sp,#-16]! 385*4882a593Smuzhiyun add x29,sp,#0 386*4882a593Smuzhiyun 387*4882a593Smuzhiyun ld1.32 {$ABCD,$EFGH},[$ctx] 388*4882a593Smuzhiyun adr $Ktbl,.LK256 389*4882a593Smuzhiyun 390*4882a593Smuzhiyun.Loop_hw: 391*4882a593Smuzhiyun ld1 {@MSG[0]-@MSG[3]},[$inp],#64 392*4882a593Smuzhiyun sub $num,$num,#1 393*4882a593Smuzhiyun ld1.32 {$W0},[$Ktbl],#16 394*4882a593Smuzhiyun rev32 @MSG[0],@MSG[0] 395*4882a593Smuzhiyun rev32 @MSG[1],@MSG[1] 396*4882a593Smuzhiyun rev32 @MSG[2],@MSG[2] 397*4882a593Smuzhiyun rev32 @MSG[3],@MSG[3] 398*4882a593Smuzhiyun orr $ABCD_SAVE,$ABCD,$ABCD // offload 399*4882a593Smuzhiyun orr $EFGH_SAVE,$EFGH,$EFGH 400*4882a593Smuzhiyun___ 401*4882a593Smuzhiyunfor($i=0;$i<12;$i++) { 402*4882a593Smuzhiyun$code.=<<___; 403*4882a593Smuzhiyun ld1.32 {$W1},[$Ktbl],#16 404*4882a593Smuzhiyun add.i32 $W0,$W0,@MSG[0] 405*4882a593Smuzhiyun sha256su0 @MSG[0],@MSG[1] 406*4882a593Smuzhiyun orr $abcd,$ABCD,$ABCD 407*4882a593Smuzhiyun sha256h $ABCD,$EFGH,$W0 408*4882a593Smuzhiyun sha256h2 $EFGH,$abcd,$W0 409*4882a593Smuzhiyun sha256su1 @MSG[0],@MSG[2],@MSG[3] 410*4882a593Smuzhiyun___ 411*4882a593Smuzhiyun 
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); 412*4882a593Smuzhiyun} 413*4882a593Smuzhiyun$code.=<<___; 414*4882a593Smuzhiyun ld1.32 {$W1},[$Ktbl],#16 415*4882a593Smuzhiyun add.i32 $W0,$W0,@MSG[0] 416*4882a593Smuzhiyun orr $abcd,$ABCD,$ABCD 417*4882a593Smuzhiyun sha256h $ABCD,$EFGH,$W0 418*4882a593Smuzhiyun sha256h2 $EFGH,$abcd,$W0 419*4882a593Smuzhiyun 420*4882a593Smuzhiyun ld1.32 {$W0},[$Ktbl],#16 421*4882a593Smuzhiyun add.i32 $W1,$W1,@MSG[1] 422*4882a593Smuzhiyun orr $abcd,$ABCD,$ABCD 423*4882a593Smuzhiyun sha256h $ABCD,$EFGH,$W1 424*4882a593Smuzhiyun sha256h2 $EFGH,$abcd,$W1 425*4882a593Smuzhiyun 426*4882a593Smuzhiyun ld1.32 {$W1},[$Ktbl] 427*4882a593Smuzhiyun add.i32 $W0,$W0,@MSG[2] 428*4882a593Smuzhiyun sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind 429*4882a593Smuzhiyun orr $abcd,$ABCD,$ABCD 430*4882a593Smuzhiyun sha256h $ABCD,$EFGH,$W0 431*4882a593Smuzhiyun sha256h2 $EFGH,$abcd,$W0 432*4882a593Smuzhiyun 433*4882a593Smuzhiyun add.i32 $W1,$W1,@MSG[3] 434*4882a593Smuzhiyun orr $abcd,$ABCD,$ABCD 435*4882a593Smuzhiyun sha256h $ABCD,$EFGH,$W1 436*4882a593Smuzhiyun sha256h2 $EFGH,$abcd,$W1 437*4882a593Smuzhiyun 438*4882a593Smuzhiyun add.i32 $ABCD,$ABCD,$ABCD_SAVE 439*4882a593Smuzhiyun add.i32 $EFGH,$EFGH,$EFGH_SAVE 440*4882a593Smuzhiyun 441*4882a593Smuzhiyun cbnz $num,.Loop_hw 442*4882a593Smuzhiyun 443*4882a593Smuzhiyun st1.32 {$ABCD,$EFGH},[$ctx] 444*4882a593Smuzhiyun 445*4882a593Smuzhiyun ldr x29,[sp],#16 446*4882a593Smuzhiyun ret 447*4882a593Smuzhiyun.size sha256_block_armv8,.-sha256_block_armv8 448*4882a593Smuzhiyun#endif 449*4882a593Smuzhiyun___ 450*4882a593Smuzhiyun} 451*4882a593Smuzhiyun 452*4882a593Smuzhiyunif ($SZ==4) { ######################################### NEON stuff # 453*4882a593Smuzhiyun# You'll surely note a lot of similarities with sha256-armv4 module, 454*4882a593Smuzhiyun# and of course it's not a coincidence. 
sha256-armv4 was used as 455*4882a593Smuzhiyun# initial template, but was adapted for ARMv8 instruction set and 456*4882a593Smuzhiyun# extensively re-tuned for all-round performance. 457*4882a593Smuzhiyun 458*4882a593Smuzhiyunmy @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); 459*4882a593Smuzhiyunmy ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); 460*4882a593Smuzhiyunmy $Ktbl="x16"; 461*4882a593Smuzhiyunmy $Xfer="x17"; 462*4882a593Smuzhiyunmy @X = map("q$_",(0..3)); 463*4882a593Smuzhiyunmy ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); 464*4882a593Smuzhiyunmy $j=0; 465*4882a593Smuzhiyun 466*4882a593Smuzhiyunsub AUTOLOAD() # thunk [simplified] x86-style perlasm 467*4882a593Smuzhiyun{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; 468*4882a593Smuzhiyun my $arg = pop; 469*4882a593Smuzhiyun $arg = "#$arg" if ($arg*1 eq $arg); 470*4882a593Smuzhiyun $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; 471*4882a593Smuzhiyun} 472*4882a593Smuzhiyun 473*4882a593Smuzhiyunsub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } 474*4882a593Smuzhiyunsub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } 475*4882a593Smuzhiyunsub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } 476*4882a593Smuzhiyun 477*4882a593Smuzhiyunsub Xupdate() 478*4882a593Smuzhiyun{ use integer; 479*4882a593Smuzhiyun my $body = shift; 480*4882a593Smuzhiyun my @insns = (&$body,&$body,&$body,&$body); 481*4882a593Smuzhiyun my ($a,$b,$c,$d,$e,$f,$g,$h); 482*4882a593Smuzhiyun 483*4882a593Smuzhiyun &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] 484*4882a593Smuzhiyun eval(shift(@insns)); 485*4882a593Smuzhiyun eval(shift(@insns)); 486*4882a593Smuzhiyun eval(shift(@insns)); 487*4882a593Smuzhiyun &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] 488*4882a593Smuzhiyun eval(shift(@insns)); 489*4882a593Smuzhiyun eval(shift(@insns)); 490*4882a593Smuzhiyun &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] 491*4882a593Smuzhiyun eval(shift(@insns)); 492*4882a593Smuzhiyun eval(shift(@insns)); 493*4882a593Smuzhiyun &ushr_32 
($T2,$T0,$sigma0[0]); 494*4882a593Smuzhiyun eval(shift(@insns)); 495*4882a593Smuzhiyun &ushr_32 ($T1,$T0,$sigma0[2]); 496*4882a593Smuzhiyun eval(shift(@insns)); 497*4882a593Smuzhiyun &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] 498*4882a593Smuzhiyun eval(shift(@insns)); 499*4882a593Smuzhiyun &sli_32 ($T2,$T0,32-$sigma0[0]); 500*4882a593Smuzhiyun eval(shift(@insns)); 501*4882a593Smuzhiyun eval(shift(@insns)); 502*4882a593Smuzhiyun &ushr_32 ($T3,$T0,$sigma0[1]); 503*4882a593Smuzhiyun eval(shift(@insns)); 504*4882a593Smuzhiyun eval(shift(@insns)); 505*4882a593Smuzhiyun &eor_8 ($T1,$T1,$T2); 506*4882a593Smuzhiyun eval(shift(@insns)); 507*4882a593Smuzhiyun eval(shift(@insns)); 508*4882a593Smuzhiyun &sli_32 ($T3,$T0,32-$sigma0[1]); 509*4882a593Smuzhiyun eval(shift(@insns)); 510*4882a593Smuzhiyun eval(shift(@insns)); 511*4882a593Smuzhiyun &ushr_32 ($T4,$T7,$sigma1[0]); 512*4882a593Smuzhiyun eval(shift(@insns)); 513*4882a593Smuzhiyun eval(shift(@insns)); 514*4882a593Smuzhiyun &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) 515*4882a593Smuzhiyun eval(shift(@insns)); 516*4882a593Smuzhiyun eval(shift(@insns)); 517*4882a593Smuzhiyun &sli_32 ($T4,$T7,32-$sigma1[0]); 518*4882a593Smuzhiyun eval(shift(@insns)); 519*4882a593Smuzhiyun eval(shift(@insns)); 520*4882a593Smuzhiyun &ushr_32 ($T5,$T7,$sigma1[2]); 521*4882a593Smuzhiyun eval(shift(@insns)); 522*4882a593Smuzhiyun eval(shift(@insns)); 523*4882a593Smuzhiyun &ushr_32 ($T3,$T7,$sigma1[1]); 524*4882a593Smuzhiyun eval(shift(@insns)); 525*4882a593Smuzhiyun eval(shift(@insns)); 526*4882a593Smuzhiyun &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) 527*4882a593Smuzhiyun eval(shift(@insns)); 528*4882a593Smuzhiyun eval(shift(@insns)); 529*4882a593Smuzhiyun &sli_u32 ($T3,$T7,32-$sigma1[1]); 530*4882a593Smuzhiyun eval(shift(@insns)); 531*4882a593Smuzhiyun eval(shift(@insns)); 532*4882a593Smuzhiyun &eor_8 ($T5,$T5,$T4); 533*4882a593Smuzhiyun eval(shift(@insns)); 534*4882a593Smuzhiyun eval(shift(@insns)); 535*4882a593Smuzhiyun 
eval(shift(@insns)); 536*4882a593Smuzhiyun &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) 537*4882a593Smuzhiyun eval(shift(@insns)); 538*4882a593Smuzhiyun eval(shift(@insns)); 539*4882a593Smuzhiyun eval(shift(@insns)); 540*4882a593Smuzhiyun &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) 541*4882a593Smuzhiyun eval(shift(@insns)); 542*4882a593Smuzhiyun eval(shift(@insns)); 543*4882a593Smuzhiyun eval(shift(@insns)); 544*4882a593Smuzhiyun &ushr_32 ($T6,@X[0],$sigma1[0]); 545*4882a593Smuzhiyun eval(shift(@insns)); 546*4882a593Smuzhiyun &ushr_32 ($T7,@X[0],$sigma1[2]); 547*4882a593Smuzhiyun eval(shift(@insns)); 548*4882a593Smuzhiyun eval(shift(@insns)); 549*4882a593Smuzhiyun &sli_32 ($T6,@X[0],32-$sigma1[0]); 550*4882a593Smuzhiyun eval(shift(@insns)); 551*4882a593Smuzhiyun &ushr_32 ($T5,@X[0],$sigma1[1]); 552*4882a593Smuzhiyun eval(shift(@insns)); 553*4882a593Smuzhiyun eval(shift(@insns)); 554*4882a593Smuzhiyun &eor_8 ($T7,$T7,$T6); 555*4882a593Smuzhiyun eval(shift(@insns)); 556*4882a593Smuzhiyun eval(shift(@insns)); 557*4882a593Smuzhiyun &sli_32 ($T5,@X[0],32-$sigma1[1]); 558*4882a593Smuzhiyun eval(shift(@insns)); 559*4882a593Smuzhiyun eval(shift(@insns)); 560*4882a593Smuzhiyun &ld1_32 ("{$T0}","[$Ktbl], #16"); 561*4882a593Smuzhiyun eval(shift(@insns)); 562*4882a593Smuzhiyun &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) 563*4882a593Smuzhiyun eval(shift(@insns)); 564*4882a593Smuzhiyun eval(shift(@insns)); 565*4882a593Smuzhiyun &eor_8 ($T5,$T5,$T5); 566*4882a593Smuzhiyun eval(shift(@insns)); 567*4882a593Smuzhiyun eval(shift(@insns)); 568*4882a593Smuzhiyun &mov (&Dhi($T5), &Dlo($T7)); 569*4882a593Smuzhiyun eval(shift(@insns)); 570*4882a593Smuzhiyun eval(shift(@insns)); 571*4882a593Smuzhiyun eval(shift(@insns)); 572*4882a593Smuzhiyun &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) 573*4882a593Smuzhiyun eval(shift(@insns)); 574*4882a593Smuzhiyun eval(shift(@insns)); 575*4882a593Smuzhiyun eval(shift(@insns)); 576*4882a593Smuzhiyun &add_32 ($T0,$T0,@X[0]); 
577*4882a593Smuzhiyun while($#insns>=1) { eval(shift(@insns)); } 578*4882a593Smuzhiyun &st1_32 ("{$T0}","[$Xfer], #16"); 579*4882a593Smuzhiyun eval(shift(@insns)); 580*4882a593Smuzhiyun 581*4882a593Smuzhiyun push(@X,shift(@X)); # "rotate" X[] 582*4882a593Smuzhiyun} 583*4882a593Smuzhiyun 584*4882a593Smuzhiyunsub Xpreload() 585*4882a593Smuzhiyun{ use integer; 586*4882a593Smuzhiyun my $body = shift; 587*4882a593Smuzhiyun my @insns = (&$body,&$body,&$body,&$body); 588*4882a593Smuzhiyun my ($a,$b,$c,$d,$e,$f,$g,$h); 589*4882a593Smuzhiyun 590*4882a593Smuzhiyun eval(shift(@insns)); 591*4882a593Smuzhiyun eval(shift(@insns)); 592*4882a593Smuzhiyun &ld1_8 ("{@X[0]}","[$inp],#16"); 593*4882a593Smuzhiyun eval(shift(@insns)); 594*4882a593Smuzhiyun eval(shift(@insns)); 595*4882a593Smuzhiyun &ld1_32 ("{$T0}","[$Ktbl],#16"); 596*4882a593Smuzhiyun eval(shift(@insns)); 597*4882a593Smuzhiyun eval(shift(@insns)); 598*4882a593Smuzhiyun eval(shift(@insns)); 599*4882a593Smuzhiyun eval(shift(@insns)); 600*4882a593Smuzhiyun &rev32 (@X[0],@X[0]); 601*4882a593Smuzhiyun eval(shift(@insns)); 602*4882a593Smuzhiyun eval(shift(@insns)); 603*4882a593Smuzhiyun eval(shift(@insns)); 604*4882a593Smuzhiyun eval(shift(@insns)); 605*4882a593Smuzhiyun &add_32 ($T0,$T0,@X[0]); 606*4882a593Smuzhiyun foreach (@insns) { eval; } # remaining instructions 607*4882a593Smuzhiyun &st1_32 ("{$T0}","[$Xfer], #16"); 608*4882a593Smuzhiyun 609*4882a593Smuzhiyun push(@X,shift(@X)); # "rotate" X[] 610*4882a593Smuzhiyun} 611*4882a593Smuzhiyun 612*4882a593Smuzhiyunsub body_00_15 () { 613*4882a593Smuzhiyun ( 614*4882a593Smuzhiyun '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. 615*4882a593Smuzhiyun '&add ($h,$h,$t1)', # h+=X[i]+K[i] 616*4882a593Smuzhiyun '&add ($a,$a,$t4);'. 
# h+=Sigma0(a) from the past 617*4882a593Smuzhiyun '&and ($t1,$f,$e)', 618*4882a593Smuzhiyun '&bic ($t4,$g,$e)', 619*4882a593Smuzhiyun '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', 620*4882a593Smuzhiyun '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past 621*4882a593Smuzhiyun '&orr ($t1,$t1,$t4)', # Ch(e,f,g) 622*4882a593Smuzhiyun '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) 623*4882a593Smuzhiyun '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', 624*4882a593Smuzhiyun '&add ($h,$h,$t1)', # h+=Ch(e,f,g) 625*4882a593Smuzhiyun '&ror ($t0,$t0,"#$Sigma1[0]")', 626*4882a593Smuzhiyun '&eor ($t2,$a,$b)', # a^b, b^c in next round 627*4882a593Smuzhiyun '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) 628*4882a593Smuzhiyun '&add ($h,$h,$t0)', # h+=Sigma1(e) 629*4882a593Smuzhiyun '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. 630*4882a593Smuzhiyun '&ldr ($t1,"[$Ktbl]") if ($j==15);'. 631*4882a593Smuzhiyun '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) 632*4882a593Smuzhiyun '&ror ($t4,$t4,"#$Sigma0[0]")', 633*4882a593Smuzhiyun '&add ($d,$d,$h)', # d+=h 634*4882a593Smuzhiyun '&eor ($t3,$t3,$b)', # Maj(a,b,c) 635*4882a593Smuzhiyun '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' 636*4882a593Smuzhiyun ) 637*4882a593Smuzhiyun} 638*4882a593Smuzhiyun 639*4882a593Smuzhiyun$code.=<<___; 640*4882a593Smuzhiyun#ifdef __KERNEL__ 641*4882a593Smuzhiyun.globl sha256_block_neon 642*4882a593Smuzhiyun#endif 643*4882a593Smuzhiyun.type sha256_block_neon,%function 644*4882a593Smuzhiyun.align 4 645*4882a593Smuzhiyunsha256_block_neon: 646*4882a593Smuzhiyun.Lneon_entry: 647*4882a593Smuzhiyun stp x29, x30, [sp, #-16]! 
648*4882a593Smuzhiyun mov x29, sp 649*4882a593Smuzhiyun sub sp,sp,#16*4 650*4882a593Smuzhiyun 651*4882a593Smuzhiyun adr $Ktbl,.LK256 652*4882a593Smuzhiyun add $num,$inp,$num,lsl#6 // len to point at the end of inp 653*4882a593Smuzhiyun 654*4882a593Smuzhiyun ld1.8 {@X[0]},[$inp], #16 655*4882a593Smuzhiyun ld1.8 {@X[1]},[$inp], #16 656*4882a593Smuzhiyun ld1.8 {@X[2]},[$inp], #16 657*4882a593Smuzhiyun ld1.8 {@X[3]},[$inp], #16 658*4882a593Smuzhiyun ld1.32 {$T0},[$Ktbl], #16 659*4882a593Smuzhiyun ld1.32 {$T1},[$Ktbl], #16 660*4882a593Smuzhiyun ld1.32 {$T2},[$Ktbl], #16 661*4882a593Smuzhiyun ld1.32 {$T3},[$Ktbl], #16 662*4882a593Smuzhiyun rev32 @X[0],@X[0] // yes, even on 663*4882a593Smuzhiyun rev32 @X[1],@X[1] // big-endian 664*4882a593Smuzhiyun rev32 @X[2],@X[2] 665*4882a593Smuzhiyun rev32 @X[3],@X[3] 666*4882a593Smuzhiyun mov $Xfer,sp 667*4882a593Smuzhiyun add.32 $T0,$T0,@X[0] 668*4882a593Smuzhiyun add.32 $T1,$T1,@X[1] 669*4882a593Smuzhiyun add.32 $T2,$T2,@X[2] 670*4882a593Smuzhiyun st1.32 {$T0-$T1},[$Xfer], #32 671*4882a593Smuzhiyun add.32 $T3,$T3,@X[3] 672*4882a593Smuzhiyun st1.32 {$T2-$T3},[$Xfer] 673*4882a593Smuzhiyun sub $Xfer,$Xfer,#32 674*4882a593Smuzhiyun 675*4882a593Smuzhiyun ldp $A,$B,[$ctx] 676*4882a593Smuzhiyun ldp $C,$D,[$ctx,#8] 677*4882a593Smuzhiyun ldp $E,$F,[$ctx,#16] 678*4882a593Smuzhiyun ldp $G,$H,[$ctx,#24] 679*4882a593Smuzhiyun ldr $t1,[sp,#0] 680*4882a593Smuzhiyun mov $t2,wzr 681*4882a593Smuzhiyun eor $t3,$B,$C 682*4882a593Smuzhiyun mov $t4,wzr 683*4882a593Smuzhiyun b .L_00_48 684*4882a593Smuzhiyun 685*4882a593Smuzhiyun.align 4 686*4882a593Smuzhiyun.L_00_48: 687*4882a593Smuzhiyun___ 688*4882a593Smuzhiyun &Xupdate(\&body_00_15); 689*4882a593Smuzhiyun &Xupdate(\&body_00_15); 690*4882a593Smuzhiyun &Xupdate(\&body_00_15); 691*4882a593Smuzhiyun &Xupdate(\&body_00_15); 692*4882a593Smuzhiyun$code.=<<___; 693*4882a593Smuzhiyun cmp $t1,#0 // check for K256 terminator 694*4882a593Smuzhiyun ldr $t1,[sp,#0] 695*4882a593Smuzhiyun sub $Xfer,$Xfer,#64 
696*4882a593Smuzhiyun bne .L_00_48 697*4882a593Smuzhiyun 698*4882a593Smuzhiyun sub $Ktbl,$Ktbl,#256 // rewind $Ktbl 699*4882a593Smuzhiyun cmp $inp,$num 700*4882a593Smuzhiyun mov $Xfer, #64 701*4882a593Smuzhiyun csel $Xfer, $Xfer, xzr, eq 702*4882a593Smuzhiyun sub $inp,$inp,$Xfer // avoid SEGV 703*4882a593Smuzhiyun mov $Xfer,sp 704*4882a593Smuzhiyun___ 705*4882a593Smuzhiyun &Xpreload(\&body_00_15); 706*4882a593Smuzhiyun &Xpreload(\&body_00_15); 707*4882a593Smuzhiyun &Xpreload(\&body_00_15); 708*4882a593Smuzhiyun &Xpreload(\&body_00_15); 709*4882a593Smuzhiyun$code.=<<___; 710*4882a593Smuzhiyun add $A,$A,$t4 // h+=Sigma0(a) from the past 711*4882a593Smuzhiyun ldp $t0,$t1,[$ctx,#0] 712*4882a593Smuzhiyun add $A,$A,$t2 // h+=Maj(a,b,c) from the past 713*4882a593Smuzhiyun ldp $t2,$t3,[$ctx,#8] 714*4882a593Smuzhiyun add $A,$A,$t0 // accumulate 715*4882a593Smuzhiyun add $B,$B,$t1 716*4882a593Smuzhiyun ldp $t0,$t1,[$ctx,#16] 717*4882a593Smuzhiyun add $C,$C,$t2 718*4882a593Smuzhiyun add $D,$D,$t3 719*4882a593Smuzhiyun ldp $t2,$t3,[$ctx,#24] 720*4882a593Smuzhiyun add $E,$E,$t0 721*4882a593Smuzhiyun add $F,$F,$t1 722*4882a593Smuzhiyun ldr $t1,[sp,#0] 723*4882a593Smuzhiyun stp $A,$B,[$ctx,#0] 724*4882a593Smuzhiyun add $G,$G,$t2 725*4882a593Smuzhiyun mov $t2,wzr 726*4882a593Smuzhiyun stp $C,$D,[$ctx,#8] 727*4882a593Smuzhiyun add $H,$H,$t3 728*4882a593Smuzhiyun stp $E,$F,[$ctx,#16] 729*4882a593Smuzhiyun eor $t3,$B,$C 730*4882a593Smuzhiyun stp $G,$H,[$ctx,#24] 731*4882a593Smuzhiyun mov $t4,wzr 732*4882a593Smuzhiyun mov $Xfer,sp 733*4882a593Smuzhiyun b.ne .L_00_48 734*4882a593Smuzhiyun 735*4882a593Smuzhiyun ldr x29,[x29] 736*4882a593Smuzhiyun add sp,sp,#16*4+16 737*4882a593Smuzhiyun ret 738*4882a593Smuzhiyun.size sha256_block_neon,.-sha256_block_neon 739*4882a593Smuzhiyun___ 740*4882a593Smuzhiyun} 741*4882a593Smuzhiyun 742*4882a593Smuzhiyun$code.=<<___; 743*4882a593Smuzhiyun#ifndef __KERNEL__ 744*4882a593Smuzhiyun.comm OPENSSL_armcap_P,4,4 745*4882a593Smuzhiyun#endif 
___

# Base encodings for the SHA-256 instructions handled by unsha256() below.
# The register numbers are OR-ed into these words.
{   my %opcode = (
	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);

    # unsha256($mnemonic, $arg)
    #   $mnemonic - instruction name, expected to be a key of %opcode
    #   $arg      - operand text: two or three q<N>/v<N> registers,
    #               comma separated, each optionally carrying a suffix
    # Returns ".inst\t0x<word>\t//<mnemonic> <arg>", where <word> is the
    # base opcode with the first register number in the low bits, the
    # second shifted left by 5 and the third (if any) shifted left by 16.
    # If the mnemonic is unknown or the operands cannot be parsed, the
    # instruction text is handed back untranslated so that it reaches the
    # assembler instead of silently disappearing from the output.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if (exists $opcode{$mnemonic} &&
	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/) {
	    # The third register is optional in the pattern; a missing one
	    # contributes 0 to the encoding.
	    my $third = defined($3) ? $3 : 0;

	    return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5)|($third<<16),
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}

# Copy this script's leading comment block to the output, rewriting the
# leading '#' of each line as '//'.  The "#!" line is skipped and copying
# stops at the first line that is neither a comment nor empty.  Failure to
# reopen the script is reported but does not abort code generation.
if (open(my $self, '<', $0)) {
	while(<$self>) {
		next if (/^#!/);
		last if (!s/^#/\/\// and !/^$/);
		print;
	}
	close $self;
} else {
	warn "$0: cannot reopen script to copy its header: $!\n";
}

# Post-process the accumulated $code line by line and print the result.
foreach(split("\n",$code)) {

	s/\`([^\`]*)\`/eval($1)/ge;		# evaluate `...` as Perl

	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;	# encode as .inst

	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers

	s/\.[ui]?8(\s)/$1/;			# drop a .8/.u8/.i8 suffix
	s/\.\w?32\b//		and s/\.16b/\.4s/g;	# .32 suffix: 16b -> 4s
	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;	# ld1/st1 with [0]: 4s -> s

	print $_,"\n";
}

# Buffered output is flushed here; a write error that surfaces only at
# close time must fail the run rather than leave a truncated file behind.
close STDOUT or die "error closing STDOUT: $!";