#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
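# For reference, the rotate/shift constants defined below (@Sigma0, @Sigma1,
# @sigma0, @sigma1) encode the standard FIPS 180-4 SHA-256 round functions
# that the generated code implements:
#
#	Ch(e,f,g)  = (e & f) ^ (~e & g)
#	Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
#	Sigma0(a)  = ror(a,2)  ^ ror(a,13) ^ ror(a,22)
#	Sigma1(e)  = ror(e,6)  ^ ror(e,11) ^ ror(e,25)
#	sigma0(x)  = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
#	sigma1(x)  = ror(x,17) ^ ror(x,19) ^ (x >> 10)
#
# The script writes its assembly output to the first argument that looks
# like a file name, so a typical invocation (file names illustrative) is:
#
#	perl sha256-armv4.pl sha256-core.S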

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
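	# NEON has no rotate instruction, so each ror(x,n) above is built from
	# a vshr.u32/vsli.32 pair: the shift right produces x>>n and the
	# shift-left-insert then merges in x<<(32-n), giving the 32-bit rotates
	# needed for the sigma0 (and, below, sigma1) terms.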
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
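	# Xpreload handles the 16 rounds that finish a block: the message
	# schedule no longer needs extending, so each call just fetches the
	# next K256 vector (above), byte-reverses the freshly loaded input
	# words and pre-adds the constants, interleaved with the scalar round
	# bodies supplied in @insns.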
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,.Lsha256_block_data_order
	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush
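# The foreach loop above is the final emission pass: backquoted expressions
# embedded in $code are evaluated, sha256* mnemonics are rewritten by
# unsha256() into raw INST() byte sequences for assemblers without
# SHA-extension support, "ret" is spelled as "bx lr", and a pre-existing
# "bx lr" is emitted as a raw .word so the integer path still assembles
# with -march=armv4.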