#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ SPDX-License-Identifier: GPL-2.0

@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.

@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ thumb		304		3212		4420
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@
@ [*]	Manually counted instructions in "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is twice smaller, they are not as
@	diverse as ARM ones: e.g., there are only two arithmetic
@	instructions with 3 arguments, no [fixed] rotate, addressing
@	modes are limited. As result it takes more instructions to do
@	the same job in Thumb, therefore the code is never twice as
@	small and always slower.
@ [***]	which is also ~35% better than compiler generated code.
@ Dual-issue Cortex A8 core was measured to process input block in
@ ~990 cycles.

@ August 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
@ Cortex A8 core and in absolute terms ~870 cycles per input block
@ [or 13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 10%
@ improvement on Cortex A8 core and 12.2 cycles per byte.

#include <linux/linkage.h>

.text

@------------------------------------------------------------------------
@ void sha1_block_data_order(u32 *state, const u8 *data, int nblocks)
@
@ In:	r0 = pointer to five-word SHA-1 state (A,B,C,D,E)
@	r1 = pointer to input data
@	r2 = number of 64-byte blocks (converted below into an end pointer)
@ Uses:	r3-r7   = working state A..E (B..E kept pre-rotated by 30/2 between
@	          rounds, hence the ror#2 corrections throughout)
@	r8      = current round constant K_xx_xx
@	r9-r12  = scratch for message-schedule loads/Xupdate
@	r14     = top of the 80-word X[] stack frame built below sp
@
@ Rounds 20-39 and 60-79 share one code body (.L_20_39_or_60_79); the
@ two phases are distinguished by the carry flag, cleared by cmn sp,#0
@ before 20-39 and set by cmp sp,#0 before 60-79.  The loop-termination
@ tests inside that body use teq (ARM) / teq after mov (Thumb) because,
@ unlike cmp, teq leaves the carry flag untouched.
@------------------------------------------------------------------------
.align	2
ENTRY(sha1_block_data_order)
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]

@ Rounds 0-14: 5x-unrolled body, executed three times.  Each round loads
@ one big-endian message word (byte-by-byte pre-ARMv7, ldr+rev otherwise),
@ pushes it onto the X[] frame, and applies F_00_19 = (B&C)|(~B&D),
@ computed here as D ^ (B & (C^D)).
.L_00_15:
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	eor	r10,r4,r5			@ F_xx_xx
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	eor	r10,r3,r4			@ F_xx_xx
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	eor	r10,r7,r3			@ F_xx_xx
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	eor	r10,r6,r7			@ F_xx_xx
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
	cmp	r14,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4

@ Round 15: last direct message-word load (same shape as .L_00_15 body).
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)

@ Rounds 16-19: from here on the message word comes from the Xupdate
@ recurrence X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16], 1), read back
@ from the frame at offsets 15/13/7/2 words above r14.
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

@ Rounds 20-39 and 60-79 share this body; F_20_39 = F_60_79 = B^C^D.
@ Carry is cleared here (cmn sp,#0: sp+0 never carries) to mark phase
@ 20-39, and set before the 60-79 entry below.
	ldr	r8,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r4,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
	@ teq leaves carry untouched, so the 20_39 vs 60_79 marker
	@ survives the loop test; bcs exits after the 60-79 pass.
 ARM(	teq	r14,sp		)	@ preserve carry
 THUMB(	mov	r11,sp		)
 THUMB(	teq	r14,r11		)	@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

@ Rounds 40-59: majority function F_40_59 = (B&C)|(B&D)|(C&D),
@ computed as (B&(C^D)) + (C&D) split across r10 and r11.
	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r4,r10,ror#2		@ F_xx_xx
	and	r11,r5,r6			@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
	and	r11,r4,r5			@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
	and	r11,r3,r4			@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
	and	r11,r7,r3			@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
	and	r11,r6,r7			@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2
	cmp	r14,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

@ Rounds 60-79: re-enter the shared 20_39 body with carry set
@ (cmp sp,#0: sp-0 produces no borrow, i.e. ARM carry = 1).
	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	@ Fold the working state back into *state; B..E still carry the
	@ pre-rotation by 2 and are corrected here.
	ldmia	r0,{r8,r9,r10,r11,r12}
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	teq	r1,r2			@ reached end of input?
	bne	.Lloop			@ [+18], total 1307

	ldmia	sp!,{r4-r12,pc}

@ Round-constant pool (FIPS 180 K values), reachable by pc-relative ldr.
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
ENDPROC(sha1_block_data_order)
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2