1*4882a593Smuzhiyun#!/usr/bin/env perl 2*4882a593Smuzhiyun# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause 3*4882a593Smuzhiyun# 4*4882a593Smuzhiyun# ==================================================================== 5*4882a593Smuzhiyun# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL 6*4882a593Smuzhiyun# project. 7*4882a593Smuzhiyun# ==================================================================== 8*4882a593Smuzhiyun# 9*4882a593Smuzhiyun# IALU(*)/gcc-4.4 NEON 10*4882a593Smuzhiyun# 11*4882a593Smuzhiyun# ARM11xx(ARMv6) 7.78/+100% - 12*4882a593Smuzhiyun# Cortex-A5 6.35/+130% 3.00 13*4882a593Smuzhiyun# Cortex-A8 6.25/+115% 2.36 14*4882a593Smuzhiyun# Cortex-A9 5.10/+95% 2.55 15*4882a593Smuzhiyun# Cortex-A15 3.85/+85% 1.25(**) 16*4882a593Smuzhiyun# Snapdragon S4 5.70/+100% 1.48(**) 17*4882a593Smuzhiyun# 18*4882a593Smuzhiyun# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; 19*4882a593Smuzhiyun# (**) these are trade-off results, they can be improved by ~8% but at 20*4882a593Smuzhiyun# the cost of 15/12% regression on Cortex-A5/A7, it's even possible 21*4882a593Smuzhiyun# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; 22*4882a593Smuzhiyun 23*4882a593Smuzhiyun$flavour = shift; 24*4882a593Smuzhiyunif ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } 25*4882a593Smuzhiyunelse { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } 26*4882a593Smuzhiyun 27*4882a593Smuzhiyunif ($flavour && $flavour ne "void") { 28*4882a593Smuzhiyun $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 29*4882a593Smuzhiyun ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 30*4882a593Smuzhiyun ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 31*4882a593Smuzhiyun die "can't locate arm-xlate.pl"; 32*4882a593Smuzhiyun 33*4882a593Smuzhiyun open STDOUT,"| \"$^X\" $xlate $flavour $output"; 34*4882a593Smuzhiyun} else { 35*4882a593Smuzhiyun open STDOUT,">$output"; 36*4882a593Smuzhiyun} 37*4882a593Smuzhiyun 38*4882a593Smuzhiyun($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); 39*4882a593Smuzhiyun 40*4882a593Smuzhiyun$code.=<<___; 41*4882a593Smuzhiyun#ifndef __KERNEL__ 42*4882a593Smuzhiyun# include "arm_arch.h" 43*4882a593Smuzhiyun#else 44*4882a593Smuzhiyun# define __ARM_ARCH__ __LINUX_ARM_ARCH__ 45*4882a593Smuzhiyun# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ 46*4882a593Smuzhiyun# define poly1305_init poly1305_init_arm 47*4882a593Smuzhiyun# define poly1305_blocks poly1305_blocks_arm 48*4882a593Smuzhiyun# define poly1305_emit poly1305_emit_arm 49*4882a593Smuzhiyun.globl poly1305_blocks_neon 50*4882a593Smuzhiyun#endif 51*4882a593Smuzhiyun 52*4882a593Smuzhiyun#if defined(__thumb2__) 53*4882a593Smuzhiyun.syntax unified 54*4882a593Smuzhiyun.thumb 55*4882a593Smuzhiyun#else 56*4882a593Smuzhiyun.code 32 57*4882a593Smuzhiyun#endif 58*4882a593Smuzhiyun 59*4882a593Smuzhiyun.text 60*4882a593Smuzhiyun 61*4882a593Smuzhiyun.globl poly1305_emit 62*4882a593Smuzhiyun.globl poly1305_blocks 63*4882a593Smuzhiyun.globl poly1305_init 64*4882a593Smuzhiyun.type poly1305_init,%function 65*4882a593Smuzhiyun.align 5 66*4882a593Smuzhiyunpoly1305_init: 67*4882a593Smuzhiyun.Lpoly1305_init: 68*4882a593Smuzhiyun stmdb sp!,{r4-r11} 69*4882a593Smuzhiyun 70*4882a593Smuzhiyun eor r3,r3,r3 71*4882a593Smuzhiyun cmp $inp,#0 72*4882a593Smuzhiyun str r3,[$ctx,#0] @ zero hash value 73*4882a593Smuzhiyun str r3,[$ctx,#4] 74*4882a593Smuzhiyun str r3,[$ctx,#8] 75*4882a593Smuzhiyun str r3,[$ctx,#12] 76*4882a593Smuzhiyun str r3,[$ctx,#16] 77*4882a593Smuzhiyun str r3,[$ctx,#36] @ clear is_base2_26 78*4882a593Smuzhiyun add $ctx,$ctx,#20 79*4882a593Smuzhiyun 80*4882a593Smuzhiyun#ifdef __thumb2__ 81*4882a593Smuzhiyun it eq 82*4882a593Smuzhiyun#endif 83*4882a593Smuzhiyun moveq r0,#0 84*4882a593Smuzhiyun beq .Lno_key 85*4882a593Smuzhiyun 86*4882a593Smuzhiyun#if __ARM_MAX_ARCH__>=7 87*4882a593Smuzhiyun mov r3,#-1 88*4882a593Smuzhiyun str r3,[$ctx,#28] @ impossible key power value 89*4882a593Smuzhiyun# ifndef __KERNEL__ 90*4882a593Smuzhiyun adr r11,.Lpoly1305_init 91*4882a593Smuzhiyun ldr r12,.LOPENSSL_armcap 92*4882a593Smuzhiyun# endif 93*4882a593Smuzhiyun#endif 94*4882a593Smuzhiyun ldrb r4,[$inp,#0] 95*4882a593Smuzhiyun mov r10,#0x0fffffff 96*4882a593Smuzhiyun ldrb r5,[$inp,#1] 97*4882a593Smuzhiyun and r3,r10,#-4 @ 0x0ffffffc 98*4882a593Smuzhiyun ldrb r6,[$inp,#2] 99*4882a593Smuzhiyun ldrb r7,[$inp,#3] 100*4882a593Smuzhiyun orr r4,r4,r5,lsl#8 101*4882a593Smuzhiyun ldrb r5,[$inp,#4] 102*4882a593Smuzhiyun orr r4,r4,r6,lsl#16 103*4882a593Smuzhiyun ldrb r6,[$inp,#5] 104*4882a593Smuzhiyun orr r4,r4,r7,lsl#24 105*4882a593Smuzhiyun ldrb r7,[$inp,#6] 106*4882a593Smuzhiyun and r4,r4,r10 107*4882a593Smuzhiyun 108*4882a593Smuzhiyun#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 109*4882a593Smuzhiyun# if !defined(_WIN32) 110*4882a593Smuzhiyun ldr r12,[r11,r12] @ OPENSSL_armcap_P 111*4882a593Smuzhiyun# endif 112*4882a593Smuzhiyun# if defined(__APPLE__) || defined(_WIN32) 113*4882a593Smuzhiyun ldr r12,[r12] 114*4882a593Smuzhiyun# endif 115*4882a593Smuzhiyun#endif 116*4882a593Smuzhiyun ldrb r8,[$inp,#7] 117*4882a593Smuzhiyun orr r5,r5,r6,lsl#8 118*4882a593Smuzhiyun ldrb r6,[$inp,#8] 119*4882a593Smuzhiyun orr r5,r5,r7,lsl#16 120*4882a593Smuzhiyun ldrb r7,[$inp,#9] 121*4882a593Smuzhiyun orr r5,r5,r8,lsl#24 122*4882a593Smuzhiyun ldrb r8,[$inp,#10] 123*4882a593Smuzhiyun and r5,r5,r3 124*4882a593Smuzhiyun 125*4882a593Smuzhiyun#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 126*4882a593Smuzhiyun tst r12,#ARMV7_NEON @ check for NEON 127*4882a593Smuzhiyun# ifdef __thumb2__ 128*4882a593Smuzhiyun adr r9,.Lpoly1305_blocks_neon 129*4882a593Smuzhiyun adr r11,.Lpoly1305_blocks 130*4882a593Smuzhiyun it ne 131*4882a593Smuzhiyun movne r11,r9 132*4882a593Smuzhiyun adr r12,.Lpoly1305_emit 133*4882a593Smuzhiyun orr r11,r11,#1 @ thumb-ify addresses 134*4882a593Smuzhiyun orr r12,r12,#1 135*4882a593Smuzhiyun# else 136*4882a593Smuzhiyun add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) 137*4882a593Smuzhiyun ite eq 138*4882a593Smuzhiyun addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) 139*4882a593Smuzhiyun addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) 140*4882a593Smuzhiyun# endif 141*4882a593Smuzhiyun#endif 142*4882a593Smuzhiyun ldrb r9,[$inp,#11] 143*4882a593Smuzhiyun orr r6,r6,r7,lsl#8 144*4882a593Smuzhiyun ldrb r7,[$inp,#12] 145*4882a593Smuzhiyun orr r6,r6,r8,lsl#16 146*4882a593Smuzhiyun ldrb r8,[$inp,#13] 147*4882a593Smuzhiyun orr r6,r6,r9,lsl#24 148*4882a593Smuzhiyun ldrb r9,[$inp,#14] 149*4882a593Smuzhiyun and r6,r6,r3 150*4882a593Smuzhiyun 151*4882a593Smuzhiyun ldrb r10,[$inp,#15] 152*4882a593Smuzhiyun orr r7,r7,r8,lsl#8 153*4882a593Smuzhiyun str r4,[$ctx,#0] 154*4882a593Smuzhiyun orr r7,r7,r9,lsl#16 155*4882a593Smuzhiyun str r5,[$ctx,#4] 156*4882a593Smuzhiyun orr r7,r7,r10,lsl#24 157*4882a593Smuzhiyun str r6,[$ctx,#8] 158*4882a593Smuzhiyun and r7,r7,r3 159*4882a593Smuzhiyun str r7,[$ctx,#12] 160*4882a593Smuzhiyun#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 161*4882a593Smuzhiyun stmia r2,{r11,r12} @ fill functions table 162*4882a593Smuzhiyun mov r0,#1 163*4882a593Smuzhiyun#else 164*4882a593Smuzhiyun mov r0,#0 165*4882a593Smuzhiyun#endif 166*4882a593Smuzhiyun.Lno_key: 167*4882a593Smuzhiyun ldmia sp!,{r4-r11} 168*4882a593Smuzhiyun#if __ARM_ARCH__>=5 169*4882a593Smuzhiyun ret @ bx lr 170*4882a593Smuzhiyun#else 171*4882a593Smuzhiyun tst lr,#1 172*4882a593Smuzhiyun moveq pc,lr @ be binary compatible with V4, yet 173*4882a593Smuzhiyun bx lr @ interoperable with Thumb ISA:-) 174*4882a593Smuzhiyun#endif 175*4882a593Smuzhiyun.size poly1305_init,.-poly1305_init 176*4882a593Smuzhiyun___ 177*4882a593Smuzhiyun{ 178*4882a593Smuzhiyunmy ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); 179*4882a593Smuzhiyunmy ($s1,$s2,$s3)=($r1,$r2,$r3); 180*4882a593Smuzhiyun 181*4882a593Smuzhiyun$code.=<<___; 182*4882a593Smuzhiyun.type poly1305_blocks,%function 183*4882a593Smuzhiyun.align 5 184*4882a593Smuzhiyunpoly1305_blocks: 185*4882a593Smuzhiyun.Lpoly1305_blocks: 186*4882a593Smuzhiyun stmdb sp!,{r3-r11,lr} 187*4882a593Smuzhiyun 188*4882a593Smuzhiyun ands $len,$len,#-16 189*4882a593Smuzhiyun beq .Lno_data 190*4882a593Smuzhiyun 191*4882a593Smuzhiyun add $len,$len,$inp @ end pointer 192*4882a593Smuzhiyun sub sp,sp,#32 193*4882a593Smuzhiyun 194*4882a593Smuzhiyun#if __ARM_ARCH__<7 195*4882a593Smuzhiyun ldmia $ctx,{$h0-$r3} @ load context 196*4882a593Smuzhiyun add $ctx,$ctx,#20 197*4882a593Smuzhiyun str $len,[sp,#16] @ offload stuff 198*4882a593Smuzhiyun str $ctx,[sp,#12] 199*4882a593Smuzhiyun#else 200*4882a593Smuzhiyun ldr lr,[$ctx,#36] @ is_base2_26 201*4882a593Smuzhiyun ldmia $ctx!,{$h0-$h4} @ load hash value 202*4882a593Smuzhiyun str $len,[sp,#16] @ offload stuff 203*4882a593Smuzhiyun str $ctx,[sp,#12] 204*4882a593Smuzhiyun 205*4882a593Smuzhiyun adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 206*4882a593Smuzhiyun mov $r1,$h1,lsr#6 207*4882a593Smuzhiyun adcs $r1,$r1,$h2,lsl#20 208*4882a593Smuzhiyun mov $r2,$h2,lsr#12 209*4882a593Smuzhiyun adcs $r2,$r2,$h3,lsl#14 210*4882a593Smuzhiyun mov $r3,$h3,lsr#18 211*4882a593Smuzhiyun adcs $r3,$r3,$h4,lsl#8 212*4882a593Smuzhiyun mov $len,#0 213*4882a593Smuzhiyun teq lr,#0 214*4882a593Smuzhiyun str $len,[$ctx,#16] @ clear is_base2_26 215*4882a593Smuzhiyun adc $len,$len,$h4,lsr#24 216*4882a593Smuzhiyun 217*4882a593Smuzhiyun itttt ne 218*4882a593Smuzhiyun movne $h0,$r0 @ choose between radixes 219*4882a593Smuzhiyun movne $h1,$r1 220*4882a593Smuzhiyun movne $h2,$r2 221*4882a593Smuzhiyun movne $h3,$r3 222*4882a593Smuzhiyun ldmia $ctx,{$r0-$r3} @ load key 223*4882a593Smuzhiyun it ne 224*4882a593Smuzhiyun movne $h4,$len 225*4882a593Smuzhiyun#endif 226*4882a593Smuzhiyun 227*4882a593Smuzhiyun mov lr,$inp 228*4882a593Smuzhiyun cmp $padbit,#0 229*4882a593Smuzhiyun str $r1,[sp,#20] 230*4882a593Smuzhiyun str $r2,[sp,#24] 231*4882a593Smuzhiyun str $r3,[sp,#28] 232*4882a593Smuzhiyun b .Loop 233*4882a593Smuzhiyun 234*4882a593Smuzhiyun.align 4 235*4882a593Smuzhiyun.Loop: 236*4882a593Smuzhiyun#if __ARM_ARCH__<7 237*4882a593Smuzhiyun ldrb r0,[lr],#16 @ load input 238*4882a593Smuzhiyun# ifdef __thumb2__ 239*4882a593Smuzhiyun it hi 240*4882a593Smuzhiyun# endif 241*4882a593Smuzhiyun addhi $h4,$h4,#1 @ 1<<128 242*4882a593Smuzhiyun ldrb r1,[lr,#-15] 243*4882a593Smuzhiyun ldrb r2,[lr,#-14] 244*4882a593Smuzhiyun ldrb r3,[lr,#-13] 245*4882a593Smuzhiyun orr r1,r0,r1,lsl#8 246*4882a593Smuzhiyun ldrb r0,[lr,#-12] 247*4882a593Smuzhiyun orr r2,r1,r2,lsl#16 248*4882a593Smuzhiyun ldrb r1,[lr,#-11] 249*4882a593Smuzhiyun orr r3,r2,r3,lsl#24 250*4882a593Smuzhiyun ldrb r2,[lr,#-10] 251*4882a593Smuzhiyun adds $h0,$h0,r3 @ accumulate input 252*4882a593Smuzhiyun 253*4882a593Smuzhiyun ldrb r3,[lr,#-9] 254*4882a593Smuzhiyun orr r1,r0,r1,lsl#8 255*4882a593Smuzhiyun ldrb r0,[lr,#-8] 256*4882a593Smuzhiyun orr r2,r1,r2,lsl#16 257*4882a593Smuzhiyun ldrb r1,[lr,#-7] 258*4882a593Smuzhiyun orr r3,r2,r3,lsl#24 259*4882a593Smuzhiyun ldrb r2,[lr,#-6] 260*4882a593Smuzhiyun adcs $h1,$h1,r3 261*4882a593Smuzhiyun 262*4882a593Smuzhiyun ldrb r3,[lr,#-5] 263*4882a593Smuzhiyun orr r1,r0,r1,lsl#8 264*4882a593Smuzhiyun ldrb r0,[lr,#-4] 265*4882a593Smuzhiyun orr r2,r1,r2,lsl#16 266*4882a593Smuzhiyun ldrb r1,[lr,#-3] 267*4882a593Smuzhiyun orr r3,r2,r3,lsl#24 268*4882a593Smuzhiyun ldrb r2,[lr,#-2] 269*4882a593Smuzhiyun adcs $h2,$h2,r3 270*4882a593Smuzhiyun 271*4882a593Smuzhiyun ldrb r3,[lr,#-1] 272*4882a593Smuzhiyun orr r1,r0,r1,lsl#8 273*4882a593Smuzhiyun str lr,[sp,#8] @ offload input pointer 274*4882a593Smuzhiyun orr r2,r1,r2,lsl#16 275*4882a593Smuzhiyun add $s1,$r1,$r1,lsr#2 276*4882a593Smuzhiyun orr r3,r2,r3,lsl#24 277*4882a593Smuzhiyun#else 278*4882a593Smuzhiyun ldr r0,[lr],#16 @ load input 279*4882a593Smuzhiyun it hi 280*4882a593Smuzhiyun addhi $h4,$h4,#1 @ padbit 281*4882a593Smuzhiyun ldr r1,[lr,#-12] 282*4882a593Smuzhiyun ldr r2,[lr,#-8] 283*4882a593Smuzhiyun ldr r3,[lr,#-4] 284*4882a593Smuzhiyun# ifdef __ARMEB__ 285*4882a593Smuzhiyun rev r0,r0 286*4882a593Smuzhiyun rev r1,r1 287*4882a593Smuzhiyun rev r2,r2 288*4882a593Smuzhiyun rev r3,r3 289*4882a593Smuzhiyun# endif 290*4882a593Smuzhiyun adds $h0,$h0,r0 @ accumulate input 291*4882a593Smuzhiyun str lr,[sp,#8] @ offload input pointer 292*4882a593Smuzhiyun adcs $h1,$h1,r1 293*4882a593Smuzhiyun add $s1,$r1,$r1,lsr#2 294*4882a593Smuzhiyun adcs $h2,$h2,r2 295*4882a593Smuzhiyun#endif 296*4882a593Smuzhiyun add $s2,$r2,$r2,lsr#2 297*4882a593Smuzhiyun adcs $h3,$h3,r3 298*4882a593Smuzhiyun add $s3,$r3,$r3,lsr#2 299*4882a593Smuzhiyun 300*4882a593Smuzhiyun umull r2,r3,$h1,$r0 301*4882a593Smuzhiyun adc $h4,$h4,#0 302*4882a593Smuzhiyun umull r0,r1,$h0,$r0 303*4882a593Smuzhiyun umlal r2,r3,$h4,$s1 304*4882a593Smuzhiyun umlal r0,r1,$h3,$s1 305*4882a593Smuzhiyun ldr $r1,[sp,#20] @ reload $r1 306*4882a593Smuzhiyun umlal r2,r3,$h2,$s3 307*4882a593Smuzhiyun umlal r0,r1,$h1,$s3 308*4882a593Smuzhiyun umlal r2,r3,$h3,$s2 309*4882a593Smuzhiyun umlal r0,r1,$h2,$s2 310*4882a593Smuzhiyun umlal r2,r3,$h0,$r1 311*4882a593Smuzhiyun str r0,[sp,#0] @ future $h0 312*4882a593Smuzhiyun mul r0,$s2,$h4 313*4882a593Smuzhiyun ldr $r2,[sp,#24] @ reload $r2 314*4882a593Smuzhiyun adds r2,r2,r1 @ d1+=d0>>32 315*4882a593Smuzhiyun eor r1,r1,r1 316*4882a593Smuzhiyun adc lr,r3,#0 @ future $h2 317*4882a593Smuzhiyun str r2,[sp,#4] @ future $h1 318*4882a593Smuzhiyun 319*4882a593Smuzhiyun mul r2,$s3,$h4 320*4882a593Smuzhiyun eor r3,r3,r3 321*4882a593Smuzhiyun umlal r0,r1,$h3,$s3 322*4882a593Smuzhiyun ldr $r3,[sp,#28] @ reload $r3 323*4882a593Smuzhiyun umlal r2,r3,$h3,$r0 324*4882a593Smuzhiyun umlal r0,r1,$h2,$r0 325*4882a593Smuzhiyun umlal r2,r3,$h2,$r1 326*4882a593Smuzhiyun umlal r0,r1,$h1,$r1 327*4882a593Smuzhiyun umlal r2,r3,$h1,$r2 328*4882a593Smuzhiyun umlal r0,r1,$h0,$r2 329*4882a593Smuzhiyun umlal r2,r3,$h0,$r3 330*4882a593Smuzhiyun ldr $h0,[sp,#0] 331*4882a593Smuzhiyun mul $h4,$r0,$h4 332*4882a593Smuzhiyun ldr $h1,[sp,#4] 333*4882a593Smuzhiyun 334*4882a593Smuzhiyun adds $h2,lr,r0 @ d2+=d1>>32 335*4882a593Smuzhiyun ldr lr,[sp,#8] @ reload input pointer 336*4882a593Smuzhiyun adc r1,r1,#0 337*4882a593Smuzhiyun adds $h3,r2,r1 @ d3+=d2>>32 338*4882a593Smuzhiyun ldr r0,[sp,#16] @ reload end pointer 339*4882a593Smuzhiyun adc r3,r3,#0 340*4882a593Smuzhiyun add $h4,$h4,r3 @ h4+=d3>>32 341*4882a593Smuzhiyun 342*4882a593Smuzhiyun and r1,$h4,#-4 343*4882a593Smuzhiyun and $h4,$h4,#3 344*4882a593Smuzhiyun add r1,r1,r1,lsr#2 @ *=5 345*4882a593Smuzhiyun adds $h0,$h0,r1 346*4882a593Smuzhiyun adcs $h1,$h1,#0 347*4882a593Smuzhiyun adcs $h2,$h2,#0 348*4882a593Smuzhiyun adcs $h3,$h3,#0 349*4882a593Smuzhiyun adc $h4,$h4,#0 350*4882a593Smuzhiyun 351*4882a593Smuzhiyun cmp r0,lr @ done yet? 352*4882a593Smuzhiyun bhi .Loop 353*4882a593Smuzhiyun 354*4882a593Smuzhiyun ldr $ctx,[sp,#12] 355*4882a593Smuzhiyun add sp,sp,#32 356*4882a593Smuzhiyun stmdb $ctx,{$h0-$h4} @ store the result 357*4882a593Smuzhiyun 358*4882a593Smuzhiyun.Lno_data: 359*4882a593Smuzhiyun#if __ARM_ARCH__>=5 360*4882a593Smuzhiyun ldmia sp!,{r3-r11,pc} 361*4882a593Smuzhiyun#else 362*4882a593Smuzhiyun ldmia sp!,{r3-r11,lr} 363*4882a593Smuzhiyun tst lr,#1 364*4882a593Smuzhiyun moveq pc,lr @ be binary compatible with V4, yet 365*4882a593Smuzhiyun bx lr @ interoperable with Thumb ISA:-) 366*4882a593Smuzhiyun#endif 367*4882a593Smuzhiyun.size poly1305_blocks,.-poly1305_blocks 368*4882a593Smuzhiyun___ 369*4882a593Smuzhiyun} 370*4882a593Smuzhiyun{ 371*4882a593Smuzhiyunmy ($ctx,$mac,$nonce)=map("r$_",(0..2)); 372*4882a593Smuzhiyunmy ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); 373*4882a593Smuzhiyunmy $g4=$ctx; 374*4882a593Smuzhiyun 375*4882a593Smuzhiyun$code.=<<___; 376*4882a593Smuzhiyun.type poly1305_emit,%function 377*4882a593Smuzhiyun.align 5 378*4882a593Smuzhiyunpoly1305_emit: 379*4882a593Smuzhiyun.Lpoly1305_emit: 380*4882a593Smuzhiyun stmdb sp!,{r4-r11} 381*4882a593Smuzhiyun 382*4882a593Smuzhiyun ldmia $ctx,{$h0-$h4} 383*4882a593Smuzhiyun 384*4882a593Smuzhiyun#if __ARM_ARCH__>=7 385*4882a593Smuzhiyun ldr ip,[$ctx,#36] @ is_base2_26 386*4882a593Smuzhiyun 387*4882a593Smuzhiyun adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 388*4882a593Smuzhiyun mov $g1,$h1,lsr#6 389*4882a593Smuzhiyun adcs $g1,$g1,$h2,lsl#20 390*4882a593Smuzhiyun mov $g2,$h2,lsr#12 391*4882a593Smuzhiyun adcs $g2,$g2,$h3,lsl#14 392*4882a593Smuzhiyun mov $g3,$h3,lsr#18 393*4882a593Smuzhiyun adcs $g3,$g3,$h4,lsl#8 394*4882a593Smuzhiyun mov $g4,#0 395*4882a593Smuzhiyun adc $g4,$g4,$h4,lsr#24 396*4882a593Smuzhiyun 397*4882a593Smuzhiyun tst ip,ip 398*4882a593Smuzhiyun itttt ne 399*4882a593Smuzhiyun movne $h0,$g0 400*4882a593Smuzhiyun movne $h1,$g1 401*4882a593Smuzhiyun movne $h2,$g2 402*4882a593Smuzhiyun movne $h3,$g3 403*4882a593Smuzhiyun it ne 404*4882a593Smuzhiyun movne $h4,$g4 405*4882a593Smuzhiyun#endif 406*4882a593Smuzhiyun 407*4882a593Smuzhiyun adds $g0,$h0,#5 @ compare to modulus 408*4882a593Smuzhiyun adcs $g1,$h1,#0 409*4882a593Smuzhiyun adcs $g2,$h2,#0 410*4882a593Smuzhiyun adcs $g3,$h3,#0 411*4882a593Smuzhiyun adc $g4,$h4,#0 412*4882a593Smuzhiyun tst $g4,#4 @ did it carry/borrow? 413*4882a593Smuzhiyun 414*4882a593Smuzhiyun#ifdef __thumb2__ 415*4882a593Smuzhiyun it ne 416*4882a593Smuzhiyun#endif 417*4882a593Smuzhiyun movne $h0,$g0 418*4882a593Smuzhiyun ldr $g0,[$nonce,#0] 419*4882a593Smuzhiyun#ifdef __thumb2__ 420*4882a593Smuzhiyun it ne 421*4882a593Smuzhiyun#endif 422*4882a593Smuzhiyun movne $h1,$g1 423*4882a593Smuzhiyun ldr $g1,[$nonce,#4] 424*4882a593Smuzhiyun#ifdef __thumb2__ 425*4882a593Smuzhiyun it ne 426*4882a593Smuzhiyun#endif 427*4882a593Smuzhiyun movne $h2,$g2 428*4882a593Smuzhiyun ldr $g2,[$nonce,#8] 429*4882a593Smuzhiyun#ifdef __thumb2__ 430*4882a593Smuzhiyun it ne 431*4882a593Smuzhiyun#endif 432*4882a593Smuzhiyun movne $h3,$g3 433*4882a593Smuzhiyun ldr $g3,[$nonce,#12] 434*4882a593Smuzhiyun 435*4882a593Smuzhiyun adds $h0,$h0,$g0 436*4882a593Smuzhiyun adcs $h1,$h1,$g1 437*4882a593Smuzhiyun adcs $h2,$h2,$g2 438*4882a593Smuzhiyun adc $h3,$h3,$g3 439*4882a593Smuzhiyun 440*4882a593Smuzhiyun#if __ARM_ARCH__>=7 441*4882a593Smuzhiyun# ifdef __ARMEB__ 442*4882a593Smuzhiyun rev $h0,$h0 443*4882a593Smuzhiyun rev $h1,$h1 444*4882a593Smuzhiyun rev $h2,$h2 445*4882a593Smuzhiyun rev $h3,$h3 446*4882a593Smuzhiyun# endif 447*4882a593Smuzhiyun str $h0,[$mac,#0] 448*4882a593Smuzhiyun str $h1,[$mac,#4] 449*4882a593Smuzhiyun str $h2,[$mac,#8] 450*4882a593Smuzhiyun str $h3,[$mac,#12] 451*4882a593Smuzhiyun#else 452*4882a593Smuzhiyun strb $h0,[$mac,#0] 453*4882a593Smuzhiyun mov $h0,$h0,lsr#8 454*4882a593Smuzhiyun strb $h1,[$mac,#4] 455*4882a593Smuzhiyun mov $h1,$h1,lsr#8 456*4882a593Smuzhiyun strb $h2,[$mac,#8] 457*4882a593Smuzhiyun mov $h2,$h2,lsr#8 458*4882a593Smuzhiyun strb $h3,[$mac,#12] 459*4882a593Smuzhiyun mov $h3,$h3,lsr#8 460*4882a593Smuzhiyun 461*4882a593Smuzhiyun strb $h0,[$mac,#1] 462*4882a593Smuzhiyun mov $h0,$h0,lsr#8 463*4882a593Smuzhiyun strb $h1,[$mac,#5] 464*4882a593Smuzhiyun mov $h1,$h1,lsr#8 465*4882a593Smuzhiyun strb $h2,[$mac,#9] 466*4882a593Smuzhiyun mov $h2,$h2,lsr#8 467*4882a593Smuzhiyun strb $h3,[$mac,#13] 468*4882a593Smuzhiyun mov $h3,$h3,lsr#8 469*4882a593Smuzhiyun 470*4882a593Smuzhiyun strb $h0,[$mac,#2] 471*4882a593Smuzhiyun mov $h0,$h0,lsr#8 472*4882a593Smuzhiyun strb $h1,[$mac,#6] 473*4882a593Smuzhiyun mov $h1,$h1,lsr#8 474*4882a593Smuzhiyun strb $h2,[$mac,#10] 475*4882a593Smuzhiyun mov $h2,$h2,lsr#8 476*4882a593Smuzhiyun strb $h3,[$mac,#14] 477*4882a593Smuzhiyun mov $h3,$h3,lsr#8 478*4882a593Smuzhiyun 479*4882a593Smuzhiyun strb $h0,[$mac,#3] 480*4882a593Smuzhiyun strb $h1,[$mac,#7] 481*4882a593Smuzhiyun strb $h2,[$mac,#11] 482*4882a593Smuzhiyun strb $h3,[$mac,#15] 483*4882a593Smuzhiyun#endif 484*4882a593Smuzhiyun ldmia sp!,{r4-r11} 485*4882a593Smuzhiyun#if __ARM_ARCH__>=5 486*4882a593Smuzhiyun ret @ bx lr 487*4882a593Smuzhiyun#else 488*4882a593Smuzhiyun tst lr,#1 489*4882a593Smuzhiyun moveq pc,lr @ be binary compatible with V4, yet 490*4882a593Smuzhiyun bx lr @ interoperable with Thumb ISA:-) 491*4882a593Smuzhiyun#endif 492*4882a593Smuzhiyun.size poly1305_emit,.-poly1305_emit 493*4882a593Smuzhiyun___ 494*4882a593Smuzhiyun{ 495*4882a593Smuzhiyunmy ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); 496*4882a593Smuzhiyunmy ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); 497*4882a593Smuzhiyunmy ($T0,$T1,$MASK) = map("q$_",(15,4,0)); 498*4882a593Smuzhiyun 499*4882a593Smuzhiyunmy ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); 500*4882a593Smuzhiyun 501*4882a593Smuzhiyun$code.=<<___; 502*4882a593Smuzhiyun#if __ARM_MAX_ARCH__>=7 503*4882a593Smuzhiyun.fpu neon 504*4882a593Smuzhiyun 505*4882a593Smuzhiyun.type poly1305_init_neon,%function 506*4882a593Smuzhiyun.align 5 507*4882a593Smuzhiyunpoly1305_init_neon: 508*4882a593Smuzhiyun.Lpoly1305_init_neon: 509*4882a593Smuzhiyun ldr r3,[$ctx,#48] @ first table element 510*4882a593Smuzhiyun cmp r3,#-1 @ is value impossible? 511*4882a593Smuzhiyun bne .Lno_init_neon 512*4882a593Smuzhiyun 513*4882a593Smuzhiyun ldr r4,[$ctx,#20] @ load key base 2^32 514*4882a593Smuzhiyun ldr r5,[$ctx,#24] 515*4882a593Smuzhiyun ldr r6,[$ctx,#28] 516*4882a593Smuzhiyun ldr r7,[$ctx,#32] 517*4882a593Smuzhiyun 518*4882a593Smuzhiyun and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 519*4882a593Smuzhiyun mov r3,r4,lsr#26 520*4882a593Smuzhiyun mov r4,r5,lsr#20 521*4882a593Smuzhiyun orr r3,r3,r5,lsl#6 522*4882a593Smuzhiyun mov r5,r6,lsr#14 523*4882a593Smuzhiyun orr r4,r4,r6,lsl#12 524*4882a593Smuzhiyun mov r6,r7,lsr#8 525*4882a593Smuzhiyun orr r5,r5,r7,lsl#18 526*4882a593Smuzhiyun and r3,r3,#0x03ffffff 527*4882a593Smuzhiyun and r4,r4,#0x03ffffff 528*4882a593Smuzhiyun and r5,r5,#0x03ffffff 529*4882a593Smuzhiyun 530*4882a593Smuzhiyun vdup.32 $R0,r2 @ r^1 in both lanes 531*4882a593Smuzhiyun add r2,r3,r3,lsl#2 @ *5 532*4882a593Smuzhiyun vdup.32 $R1,r3 533*4882a593Smuzhiyun add r3,r4,r4,lsl#2 534*4882a593Smuzhiyun vdup.32 $S1,r2 535*4882a593Smuzhiyun vdup.32 $R2,r4 536*4882a593Smuzhiyun add r4,r5,r5,lsl#2 537*4882a593Smuzhiyun vdup.32 $S2,r3 538*4882a593Smuzhiyun vdup.32 $R3,r5 539*4882a593Smuzhiyun add r5,r6,r6,lsl#2 540*4882a593Smuzhiyun vdup.32 $S3,r4 541*4882a593Smuzhiyun vdup.32 $R4,r6 542*4882a593Smuzhiyun vdup.32 $S4,r5 543*4882a593Smuzhiyun 544*4882a593Smuzhiyun mov $zeros,#2 @ counter 545*4882a593Smuzhiyun 546*4882a593Smuzhiyun.Lsquare_neon: 547*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 548*4882a593Smuzhiyun @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 549*4882a593Smuzhiyun @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 550*4882a593Smuzhiyun @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 551*4882a593Smuzhiyun @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 552*4882a593Smuzhiyun @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 553*4882a593Smuzhiyun 554*4882a593Smuzhiyun vmull.u32 $D0,$R0,${R0}[1] 555*4882a593Smuzhiyun vmull.u32 $D1,$R1,${R0}[1] 556*4882a593Smuzhiyun vmull.u32 $D2,$R2,${R0}[1] 557*4882a593Smuzhiyun vmull.u32 $D3,$R3,${R0}[1] 558*4882a593Smuzhiyun vmull.u32 $D4,$R4,${R0}[1] 559*4882a593Smuzhiyun 560*4882a593Smuzhiyun vmlal.u32 $D0,$R4,${S1}[1] 561*4882a593Smuzhiyun vmlal.u32 $D1,$R0,${R1}[1] 562*4882a593Smuzhiyun vmlal.u32 $D2,$R1,${R1}[1] 563*4882a593Smuzhiyun vmlal.u32 $D3,$R2,${R1}[1] 564*4882a593Smuzhiyun vmlal.u32 $D4,$R3,${R1}[1] 565*4882a593Smuzhiyun 566*4882a593Smuzhiyun vmlal.u32 $D0,$R3,${S2}[1] 567*4882a593Smuzhiyun vmlal.u32 $D1,$R4,${S2}[1] 568*4882a593Smuzhiyun vmlal.u32 $D3,$R1,${R2}[1] 569*4882a593Smuzhiyun vmlal.u32 $D2,$R0,${R2}[1] 570*4882a593Smuzhiyun vmlal.u32 $D4,$R2,${R2}[1] 571*4882a593Smuzhiyun 572*4882a593Smuzhiyun vmlal.u32 $D0,$R2,${S3}[1] 573*4882a593Smuzhiyun vmlal.u32 $D3,$R0,${R3}[1] 574*4882a593Smuzhiyun vmlal.u32 $D1,$R3,${S3}[1] 575*4882a593Smuzhiyun vmlal.u32 $D2,$R4,${S3}[1] 576*4882a593Smuzhiyun vmlal.u32 $D4,$R1,${R3}[1] 577*4882a593Smuzhiyun 578*4882a593Smuzhiyun vmlal.u32 $D3,$R4,${S4}[1] 579*4882a593Smuzhiyun vmlal.u32 $D0,$R1,${S4}[1] 580*4882a593Smuzhiyun vmlal.u32 $D1,$R2,${S4}[1] 581*4882a593Smuzhiyun vmlal.u32 $D2,$R3,${S4}[1] 582*4882a593Smuzhiyun vmlal.u32 $D4,$R0,${R4}[1] 583*4882a593Smuzhiyun 584*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 585*4882a593Smuzhiyun @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 586*4882a593Smuzhiyun @ and P. Schwabe 587*4882a593Smuzhiyun @ 588*4882a593Smuzhiyun @ H0>>+H1>>+H2>>+H3>>+H4 589*4882a593Smuzhiyun @ H3>>+H4>>*5+H0>>+H1 590*4882a593Smuzhiyun @ 591*4882a593Smuzhiyun @ Trivia. 592*4882a593Smuzhiyun @ 593*4882a593Smuzhiyun @ Result of multiplication of n-bit number by m-bit number is 594*4882a593Smuzhiyun @ n+m bits wide. However! Even though 2^n is a n+1-bit number, 595*4882a593Smuzhiyun @ m-bit number multiplied by 2^n is still n+m bits wide. 596*4882a593Smuzhiyun @ 597*4882a593Smuzhiyun @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, 598*4882a593Smuzhiyun @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit 599*4882a593Smuzhiyun @ one is n+1 bits wide. 600*4882a593Smuzhiyun @ 601*4882a593Smuzhiyun @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that 602*4882a593Smuzhiyun @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 603*4882a593Smuzhiyun @ can be 27. However! In cases when their width exceeds 26 bits 604*4882a593Smuzhiyun @ they are limited by 2^26+2^6. This in turn means that *sum* 605*4882a593Smuzhiyun @ of the products with these values can still be viewed as sum 606*4882a593Smuzhiyun @ of 52-bit numbers as long as the amount of addends is not a 607*4882a593Smuzhiyun @ power of 2. For example, 608*4882a593Smuzhiyun @ 609*4882a593Smuzhiyun @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, 610*4882a593Smuzhiyun @ 611*4882a593Smuzhiyun @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or 612*4882a593Smuzhiyun @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than 613*4882a593Smuzhiyun @ 8 * (2^52) or 2^55. However, the value is then multiplied by 614*4882a593Smuzhiyun @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), 615*4882a593Smuzhiyun @ which is less than 32 * (2^52) or 2^57. And when processing 616*4882a593Smuzhiyun @ data we are looking at triple as many addends... 617*4882a593Smuzhiyun @ 618*4882a593Smuzhiyun @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and 619*4882a593Smuzhiyun @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the 620*4882a593Smuzhiyun @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while 621*4882a593Smuzhiyun @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 622*4882a593Smuzhiyun @ instruction accepts 2x32-bit input and writes 2x64-bit result. 623*4882a593Smuzhiyun @ This means that result of reduction have to be compressed upon 624*4882a593Smuzhiyun @ loop wrap-around. This can be done in the process of reduction 625*4882a593Smuzhiyun @ to minimize amount of instructions [as well as amount of 626*4882a593Smuzhiyun @ 128-bit instructions, which benefits low-end processors], but 627*4882a593Smuzhiyun @ one has to watch for H2 (which is narrower than H0) and 5*H4 628*4882a593Smuzhiyun @ not being wider than 58 bits, so that result of right shift 629*4882a593Smuzhiyun @ by 26 bits fits in 32 bits. This is also useful on x86, 630*4882a593Smuzhiyun @ because it allows to use paddd in place for paddq, which 631*4882a593Smuzhiyun @ benefits Atom, where paddq is ridiculously slow. 632*4882a593Smuzhiyun 633*4882a593Smuzhiyun vshr.u64 $T0,$D3,#26 634*4882a593Smuzhiyun vmovn.i64 $D3#lo,$D3 635*4882a593Smuzhiyun vshr.u64 $T1,$D0,#26 636*4882a593Smuzhiyun vmovn.i64 $D0#lo,$D0 637*4882a593Smuzhiyun vadd.i64 $D4,$D4,$T0 @ h3 -> h4 638*4882a593Smuzhiyun vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff 639*4882a593Smuzhiyun vadd.i64 $D1,$D1,$T1 @ h0 -> h1 640*4882a593Smuzhiyun vbic.i32 $D0#lo,#0xfc000000 641*4882a593Smuzhiyun 642*4882a593Smuzhiyun vshrn.u64 $T0#lo,$D4,#26 643*4882a593Smuzhiyun vmovn.i64 $D4#lo,$D4 644*4882a593Smuzhiyun vshr.u64 $T1,$D1,#26 645*4882a593Smuzhiyun vmovn.i64 $D1#lo,$D1 646*4882a593Smuzhiyun vadd.i64 $D2,$D2,$T1 @ h1 -> h2 647*4882a593Smuzhiyun vbic.i32 $D4#lo,#0xfc000000 648*4882a593Smuzhiyun vbic.i32 $D1#lo,#0xfc000000 649*4882a593Smuzhiyun 650*4882a593Smuzhiyun vadd.i32 $D0#lo,$D0#lo,$T0#lo 651*4882a593Smuzhiyun vshl.u32 $T0#lo,$T0#lo,#2 652*4882a593Smuzhiyun vshrn.u64 $T1#lo,$D2,#26 653*4882a593Smuzhiyun vmovn.i64 $D2#lo,$D2 654*4882a593Smuzhiyun vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 655*4882a593Smuzhiyun vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 656*4882a593Smuzhiyun vbic.i32 $D2#lo,#0xfc000000 657*4882a593Smuzhiyun 658*4882a593Smuzhiyun vshr.u32 $T0#lo,$D0#lo,#26 659*4882a593Smuzhiyun vbic.i32 $D0#lo,#0xfc000000 660*4882a593Smuzhiyun vshr.u32 $T1#lo,$D3#lo,#26 661*4882a593Smuzhiyun vbic.i32 $D3#lo,#0xfc000000 662*4882a593Smuzhiyun vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 663*4882a593Smuzhiyun vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 664*4882a593Smuzhiyun 665*4882a593Smuzhiyun subs $zeros,$zeros,#1 666*4882a593Smuzhiyun beq .Lsquare_break_neon 667*4882a593Smuzhiyun 668*4882a593Smuzhiyun add $tbl0,$ctx,#(48+0*9*4) 669*4882a593Smuzhiyun add $tbl1,$ctx,#(48+1*9*4) 670*4882a593Smuzhiyun 671*4882a593Smuzhiyun vtrn.32 $R0,$D0#lo @ r^2:r^1 672*4882a593Smuzhiyun vtrn.32 $R2,$D2#lo 673*4882a593Smuzhiyun vtrn.32 $R3,$D3#lo 674*4882a593Smuzhiyun vtrn.32 $R1,$D1#lo 675*4882a593Smuzhiyun vtrn.32 $R4,$D4#lo 676*4882a593Smuzhiyun 677*4882a593Smuzhiyun vshl.u32 $S2,$R2,#2 @ *5 678*4882a593Smuzhiyun vshl.u32 $S3,$R3,#2 679*4882a593Smuzhiyun vshl.u32 $S1,$R1,#2 680*4882a593Smuzhiyun vshl.u32 $S4,$R4,#2 681*4882a593Smuzhiyun vadd.i32 $S2,$S2,$R2 682*4882a593Smuzhiyun vadd.i32 $S1,$S1,$R1 683*4882a593Smuzhiyun vadd.i32 $S3,$S3,$R3 684*4882a593Smuzhiyun vadd.i32 $S4,$S4,$R4 685*4882a593Smuzhiyun 686*4882a593Smuzhiyun vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 687*4882a593Smuzhiyun vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 688*4882a593Smuzhiyun vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 689*4882a593Smuzhiyun vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 690*4882a593Smuzhiyun vst1.32 {${S4}[0]},[$tbl0,:32] 691*4882a593Smuzhiyun vst1.32 {${S4}[1]},[$tbl1,:32] 692*4882a593Smuzhiyun 693*4882a593Smuzhiyun b .Lsquare_neon 694*4882a593Smuzhiyun 695*4882a593Smuzhiyun.align 4 696*4882a593Smuzhiyun.Lsquare_break_neon: 697*4882a593Smuzhiyun add $tbl0,$ctx,#(48+2*4*9) 698*4882a593Smuzhiyun add $tbl1,$ctx,#(48+3*4*9) 699*4882a593Smuzhiyun 700*4882a593Smuzhiyun vmov $R0,$D0#lo @ r^4:r^3 701*4882a593Smuzhiyun vshl.u32 $S1,$D1#lo,#2 @ *5 702*4882a593Smuzhiyun vmov $R1,$D1#lo 703*4882a593Smuzhiyun vshl.u32 $S2,$D2#lo,#2 704*4882a593Smuzhiyun vmov $R2,$D2#lo 705*4882a593Smuzhiyun vshl.u32 $S3,$D3#lo,#2 706*4882a593Smuzhiyun vmov $R3,$D3#lo 707*4882a593Smuzhiyun vshl.u32 $S4,$D4#lo,#2 708*4882a593Smuzhiyun vmov $R4,$D4#lo 709*4882a593Smuzhiyun vadd.i32 $S1,$S1,$D1#lo 710*4882a593Smuzhiyun vadd.i32 $S2,$S2,$D2#lo 711*4882a593Smuzhiyun vadd.i32 $S3,$S3,$D3#lo 712*4882a593Smuzhiyun vadd.i32 $S4,$S4,$D4#lo 713*4882a593Smuzhiyun 714*4882a593Smuzhiyun vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 715*4882a593Smuzhiyun vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 716*4882a593Smuzhiyun vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 717*4882a593Smuzhiyun vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 718*4882a593Smuzhiyun vst1.32 {${S4}[0]},[$tbl0] 719*4882a593Smuzhiyun vst1.32 {${S4}[1]},[$tbl1] 720*4882a593Smuzhiyun 721*4882a593Smuzhiyun.Lno_init_neon: 722*4882a593Smuzhiyun ret @ bx lr 723*4882a593Smuzhiyun.size poly1305_init_neon,.-poly1305_init_neon 724*4882a593Smuzhiyun 725*4882a593Smuzhiyun.type poly1305_blocks_neon,%function 726*4882a593Smuzhiyun.align 5 727*4882a593Smuzhiyunpoly1305_blocks_neon: 728*4882a593Smuzhiyun.Lpoly1305_blocks_neon: 729*4882a593Smuzhiyun ldr ip,[$ctx,#36] @ is_base2_26 730*4882a593Smuzhiyun 731*4882a593Smuzhiyun cmp $len,#64 732*4882a593Smuzhiyun blo .Lpoly1305_blocks 733*4882a593Smuzhiyun 734*4882a593Smuzhiyun stmdb sp!,{r4-r7} 735*4882a593Smuzhiyun vstmdb sp!,{d8-d15} @ ABI specification says so 736*4882a593Smuzhiyun 737*4882a593Smuzhiyun tst ip,ip @ is_base2_26? 738*4882a593Smuzhiyun bne .Lbase2_26_neon 739*4882a593Smuzhiyun 740*4882a593Smuzhiyun stmdb sp!,{r1-r3,lr} 741*4882a593Smuzhiyun bl .Lpoly1305_init_neon 742*4882a593Smuzhiyun 743*4882a593Smuzhiyun ldr r4,[$ctx,#0] @ load hash value base 2^32 744*4882a593Smuzhiyun ldr r5,[$ctx,#4] 745*4882a593Smuzhiyun ldr r6,[$ctx,#8] 746*4882a593Smuzhiyun ldr r7,[$ctx,#12] 747*4882a593Smuzhiyun ldr ip,[$ctx,#16] 748*4882a593Smuzhiyun 749*4882a593Smuzhiyun and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 750*4882a593Smuzhiyun mov r3,r4,lsr#26 751*4882a593Smuzhiyun veor $D0#lo,$D0#lo,$D0#lo 752*4882a593Smuzhiyun mov r4,r5,lsr#20 753*4882a593Smuzhiyun orr r3,r3,r5,lsl#6 754*4882a593Smuzhiyun veor $D1#lo,$D1#lo,$D1#lo 755*4882a593Smuzhiyun mov r5,r6,lsr#14 756*4882a593Smuzhiyun orr r4,r4,r6,lsl#12 757*4882a593Smuzhiyun veor $D2#lo,$D2#lo,$D2#lo 758*4882a593Smuzhiyun mov r6,r7,lsr#8 759*4882a593Smuzhiyun orr r5,r5,r7,lsl#18 760*4882a593Smuzhiyun veor $D3#lo,$D3#lo,$D3#lo 761*4882a593Smuzhiyun and r3,r3,#0x03ffffff 762*4882a593Smuzhiyun orr r6,r6,ip,lsl#24 763*4882a593Smuzhiyun veor $D4#lo,$D4#lo,$D4#lo 764*4882a593Smuzhiyun and r4,r4,#0x03ffffff 765*4882a593Smuzhiyun mov r1,#1 766*4882a593Smuzhiyun and r5,r5,#0x03ffffff 767*4882a593Smuzhiyun str r1,[$ctx,#36] @ set is_base2_26 768*4882a593Smuzhiyun 769*4882a593Smuzhiyun vmov.32 $D0#lo[0],r2 770*4882a593Smuzhiyun vmov.32 $D1#lo[0],r3 771*4882a593Smuzhiyun vmov.32 $D2#lo[0],r4 772*4882a593Smuzhiyun vmov.32 $D3#lo[0],r5 773*4882a593Smuzhiyun vmov.32 $D4#lo[0],r6 774*4882a593Smuzhiyun adr $zeros,.Lzeros 775*4882a593Smuzhiyun 776*4882a593Smuzhiyun ldmia sp!,{r1-r3,lr} 777*4882a593Smuzhiyun b .Lhash_loaded 778*4882a593Smuzhiyun 779*4882a593Smuzhiyun.align 4 780*4882a593Smuzhiyun.Lbase2_26_neon: 781*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 782*4882a593Smuzhiyun @ load hash value 783*4882a593Smuzhiyun 784*4882a593Smuzhiyun veor $D0#lo,$D0#lo,$D0#lo 785*4882a593Smuzhiyun veor $D1#lo,$D1#lo,$D1#lo 786*4882a593Smuzhiyun veor $D2#lo,$D2#lo,$D2#lo 787*4882a593Smuzhiyun veor $D3#lo,$D3#lo,$D3#lo 788*4882a593Smuzhiyun veor $D4#lo,$D4#lo,$D4#lo 789*4882a593Smuzhiyun vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 790*4882a593Smuzhiyun adr $zeros,.Lzeros 791*4882a593Smuzhiyun vld1.32 {$D4#lo[0]},[$ctx] 792*4882a593Smuzhiyun sub $ctx,$ctx,#16 @ rewind 793*4882a593Smuzhiyun 794*4882a593Smuzhiyun.Lhash_loaded: 795*4882a593Smuzhiyun add $in2,$inp,#32 796*4882a593Smuzhiyun mov $padbit,$padbit,lsl#24 797*4882a593Smuzhiyun tst $len,#31 798*4882a593Smuzhiyun beq .Leven 799*4882a593Smuzhiyun 800*4882a593Smuzhiyun vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! 801*4882a593Smuzhiyun vmov.32 $H4#lo[0],$padbit 802*4882a593Smuzhiyun sub $len,$len,#16 803*4882a593Smuzhiyun add $in2,$inp,#32 804*4882a593Smuzhiyun 805*4882a593Smuzhiyun# ifdef __ARMEB__ 806*4882a593Smuzhiyun vrev32.8 $H0,$H0 807*4882a593Smuzhiyun vrev32.8 $H3,$H3 808*4882a593Smuzhiyun vrev32.8 $H1,$H1 809*4882a593Smuzhiyun vrev32.8 $H2,$H2 810*4882a593Smuzhiyun# endif 811*4882a593Smuzhiyun vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 812*4882a593Smuzhiyun vshl.u32 $H3#lo,$H3#lo,#18 813*4882a593Smuzhiyun 814*4882a593Smuzhiyun vsri.u32 $H3#lo,$H2#lo,#14 815*4882a593Smuzhiyun vshl.u32 $H2#lo,$H2#lo,#12 816*4882a593Smuzhiyun vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi 817*4882a593Smuzhiyun 818*4882a593Smuzhiyun vbic.i32 $H3#lo,#0xfc000000 819*4882a593Smuzhiyun vsri.u32 $H2#lo,$H1#lo,#20 820*4882a593Smuzhiyun vshl.u32 $H1#lo,$H1#lo,#6 821*4882a593Smuzhiyun 822*4882a593Smuzhiyun vbic.i32 $H2#lo,#0xfc000000 823*4882a593Smuzhiyun vsri.u32 $H1#lo,$H0#lo,#26 824*4882a593Smuzhiyun vadd.i32 $H3#hi,$H3#lo,$D3#lo 825*4882a593Smuzhiyun 826*4882a593Smuzhiyun vbic.i32 $H0#lo,#0xfc000000 827*4882a593Smuzhiyun vbic.i32 $H1#lo,#0xfc000000 828*4882a593Smuzhiyun vadd.i32 $H2#hi,$H2#lo,$D2#lo 829*4882a593Smuzhiyun 830*4882a593Smuzhiyun vadd.i32 $H0#hi,$H0#lo,$D0#lo 831*4882a593Smuzhiyun vadd.i32 $H1#hi,$H1#lo,$D1#lo 832*4882a593Smuzhiyun 833*4882a593Smuzhiyun mov $tbl1,$zeros 834*4882a593Smuzhiyun add $tbl0,$ctx,#48 835*4882a593Smuzhiyun 836*4882a593Smuzhiyun cmp $len,$len 837*4882a593Smuzhiyun b .Long_tail 838*4882a593Smuzhiyun 839*4882a593Smuzhiyun.align 4 840*4882a593Smuzhiyun.Leven: 841*4882a593Smuzhiyun subs $len,$len,#64 842*4882a593Smuzhiyun it lo 843*4882a593Smuzhiyun movlo $in2,$zeros 844*4882a593Smuzhiyun 845*4882a593Smuzhiyun vmov.i32 $H4,#1<<24 @ padbit, yes, always 846*4882a593Smuzhiyun vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] 847*4882a593Smuzhiyun add $inp,$inp,#64 848*4882a593Smuzhiyun vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) 849*4882a593Smuzhiyun add $in2,$in2,#64 850*4882a593Smuzhiyun itt hi 851*4882a593Smuzhiyun addhi $tbl1,$ctx,#(48+1*9*4) 852*4882a593Smuzhiyun addhi $tbl0,$ctx,#(48+3*9*4) 853*4882a593Smuzhiyun 854*4882a593Smuzhiyun# ifdef __ARMEB__ 855*4882a593Smuzhiyun vrev32.8 $H0,$H0 856*4882a593Smuzhiyun vrev32.8 $H3,$H3 857*4882a593Smuzhiyun vrev32.8 $H1,$H1 858*4882a593Smuzhiyun vrev32.8 $H2,$H2 859*4882a593Smuzhiyun# endif 860*4882a593Smuzhiyun vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 861*4882a593Smuzhiyun vshl.u32 $H3,$H3,#18 862*4882a593Smuzhiyun 863*4882a593Smuzhiyun vsri.u32 $H3,$H2,#14 864*4882a593Smuzhiyun vshl.u32 $H2,$H2,#12 865*4882a593Smuzhiyun 866*4882a593Smuzhiyun vbic.i32 $H3,#0xfc000000 867*4882a593Smuzhiyun vsri.u32 $H2,$H1,#20 868*4882a593Smuzhiyun vshl.u32 $H1,$H1,#6 869*4882a593Smuzhiyun 870*4882a593Smuzhiyun vbic.i32 $H2,#0xfc000000 871*4882a593Smuzhiyun vsri.u32 $H1,$H0,#26 872*4882a593Smuzhiyun 873*4882a593Smuzhiyun vbic.i32 $H0,#0xfc000000 874*4882a593Smuzhiyun vbic.i32 $H1,#0xfc000000 875*4882a593Smuzhiyun 876*4882a593Smuzhiyun bls .Lskip_loop 877*4882a593Smuzhiyun 878*4882a593Smuzhiyun vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 879*4882a593Smuzhiyun vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 880*4882a593Smuzhiyun vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 881*4882a593Smuzhiyun vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 882*4882a593Smuzhiyun b .Loop_neon 883*4882a593Smuzhiyun 884*4882a593Smuzhiyun.align 5 885*4882a593Smuzhiyun.Loop_neon: 886*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 887*4882a593Smuzhiyun @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 888*4882a593Smuzhiyun @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 889*4882a593Smuzhiyun @ \___________________/ 890*4882a593Smuzhiyun @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 891*4882a593Smuzhiyun @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 892*4882a593Smuzhiyun @ \___________________/ \____________________/ 893*4882a593Smuzhiyun @ 894*4882a593Smuzhiyun @ Note that we start with inp[2:3]*r^2. This is because it 895*4882a593Smuzhiyun @ doesn't depend on reduction in previous iteration. 896*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 897*4882a593Smuzhiyun @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 898*4882a593Smuzhiyun @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 899*4882a593Smuzhiyun @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 900*4882a593Smuzhiyun @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 901*4882a593Smuzhiyun @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 902*4882a593Smuzhiyun 903*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 904*4882a593Smuzhiyun @ inp[2:3]*r^2 905*4882a593Smuzhiyun 906*4882a593Smuzhiyun vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] 907*4882a593Smuzhiyun vmull.u32 $D2,$H2#hi,${R0}[1] 908*4882a593Smuzhiyun vadd.i32 $H0#lo,$H0#lo,$D0#lo 909*4882a593Smuzhiyun vmull.u32 $D0,$H0#hi,${R0}[1] 910*4882a593Smuzhiyun vadd.i32 $H3#lo,$H3#lo,$D3#lo 911*4882a593Smuzhiyun vmull.u32 $D3,$H3#hi,${R0}[1] 912*4882a593Smuzhiyun vmlal.u32 $D2,$H1#hi,${R1}[1] 913*4882a593Smuzhiyun vadd.i32 $H1#lo,$H1#lo,$D1#lo 914*4882a593Smuzhiyun vmull.u32 $D1,$H1#hi,${R0}[1] 915*4882a593Smuzhiyun 916*4882a593Smuzhiyun vadd.i32 $H4#lo,$H4#lo,$D4#lo 917*4882a593Smuzhiyun vmull.u32 $D4,$H4#hi,${R0}[1] 918*4882a593Smuzhiyun subs $len,$len,#64 919*4882a593Smuzhiyun vmlal.u32 $D0,$H4#hi,${S1}[1] 920*4882a593Smuzhiyun it lo 921*4882a593Smuzhiyun movlo $in2,$zeros 922*4882a593Smuzhiyun vmlal.u32 $D3,$H2#hi,${R1}[1] 923*4882a593Smuzhiyun vld1.32 ${S4}[1],[$tbl1,:32] 924*4882a593Smuzhiyun vmlal.u32 $D1,$H0#hi,${R1}[1] 925*4882a593Smuzhiyun vmlal.u32 $D4,$H3#hi,${R1}[1] 926*4882a593Smuzhiyun 927*4882a593Smuzhiyun vmlal.u32 $D0,$H3#hi,${S2}[1] 928*4882a593Smuzhiyun vmlal.u32 $D3,$H1#hi,${R2}[1] 929*4882a593Smuzhiyun vmlal.u32 $D4,$H2#hi,${R2}[1] 930*4882a593Smuzhiyun vmlal.u32 $D1,$H4#hi,${S2}[1] 931*4882a593Smuzhiyun vmlal.u32 $D2,$H0#hi,${R2}[1] 932*4882a593Smuzhiyun 933*4882a593Smuzhiyun vmlal.u32 $D3,$H0#hi,${R3}[1] 934*4882a593Smuzhiyun vmlal.u32 $D0,$H2#hi,${S3}[1] 935*4882a593Smuzhiyun vmlal.u32 $D4,$H1#hi,${R3}[1] 936*4882a593Smuzhiyun vmlal.u32 $D1,$H3#hi,${S3}[1] 937*4882a593Smuzhiyun vmlal.u32 $D2,$H4#hi,${S3}[1] 938*4882a593Smuzhiyun 939*4882a593Smuzhiyun vmlal.u32 $D3,$H4#hi,${S4}[1] 940*4882a593Smuzhiyun vmlal.u32 $D0,$H1#hi,${S4}[1] 941*4882a593Smuzhiyun vmlal.u32 $D4,$H0#hi,${R4}[1] 942*4882a593Smuzhiyun vmlal.u32 $D1,$H2#hi,${S4}[1] 943*4882a593Smuzhiyun vmlal.u32 $D2,$H3#hi,${S4}[1] 944*4882a593Smuzhiyun 945*4882a593Smuzhiyun vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) 946*4882a593Smuzhiyun add $in2,$in2,#64 947*4882a593Smuzhiyun 948*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 949*4882a593Smuzhiyun @ (hash+inp[0:1])*r^4 and accumulate 950*4882a593Smuzhiyun 951*4882a593Smuzhiyun vmlal.u32 $D3,$H3#lo,${R0}[0] 952*4882a593Smuzhiyun vmlal.u32 $D0,$H0#lo,${R0}[0] 953*4882a593Smuzhiyun vmlal.u32 $D4,$H4#lo,${R0}[0] 954*4882a593Smuzhiyun vmlal.u32 $D1,$H1#lo,${R0}[0] 955*4882a593Smuzhiyun vmlal.u32 $D2,$H2#lo,${R0}[0] 956*4882a593Smuzhiyun vld1.32 ${S4}[0],[$tbl0,:32] 957*4882a593Smuzhiyun 958*4882a593Smuzhiyun vmlal.u32 $D3,$H2#lo,${R1}[0] 959*4882a593Smuzhiyun vmlal.u32 $D0,$H4#lo,${S1}[0] 960*4882a593Smuzhiyun vmlal.u32 $D4,$H3#lo,${R1}[0] 961*4882a593Smuzhiyun vmlal.u32 $D1,$H0#lo,${R1}[0] 962*4882a593Smuzhiyun vmlal.u32 $D2,$H1#lo,${R1}[0] 963*4882a593Smuzhiyun 964*4882a593Smuzhiyun vmlal.u32 $D3,$H1#lo,${R2}[0] 965*4882a593Smuzhiyun vmlal.u32 $D0,$H3#lo,${S2}[0] 966*4882a593Smuzhiyun vmlal.u32 $D4,$H2#lo,${R2}[0] 967*4882a593Smuzhiyun vmlal.u32 $D1,$H4#lo,${S2}[0] 968*4882a593Smuzhiyun vmlal.u32 $D2,$H0#lo,${R2}[0] 969*4882a593Smuzhiyun 970*4882a593Smuzhiyun vmlal.u32 $D3,$H0#lo,${R3}[0] 971*4882a593Smuzhiyun vmlal.u32 $D0,$H2#lo,${S3}[0] 972*4882a593Smuzhiyun vmlal.u32 $D4,$H1#lo,${R3}[0] 973*4882a593Smuzhiyun vmlal.u32 $D1,$H3#lo,${S3}[0] 974*4882a593Smuzhiyun vmlal.u32 $D3,$H4#lo,${S4}[0] 975*4882a593Smuzhiyun 976*4882a593Smuzhiyun vmlal.u32 $D2,$H4#lo,${S3}[0] 977*4882a593Smuzhiyun vmlal.u32 $D0,$H1#lo,${S4}[0] 978*4882a593Smuzhiyun vmlal.u32 $D4,$H0#lo,${R4}[0] 979*4882a593Smuzhiyun vmov.i32 $H4,#1<<24 @ padbit, yes, always 980*4882a593Smuzhiyun vmlal.u32 $D1,$H2#lo,${S4}[0] 981*4882a593Smuzhiyun vmlal.u32 $D2,$H3#lo,${S4}[0] 982*4882a593Smuzhiyun 983*4882a593Smuzhiyun vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] 984*4882a593Smuzhiyun add $inp,$inp,#64 985*4882a593Smuzhiyun# ifdef __ARMEB__ 986*4882a593Smuzhiyun vrev32.8 $H0,$H0 987*4882a593Smuzhiyun vrev32.8 $H1,$H1 988*4882a593Smuzhiyun vrev32.8 $H2,$H2 989*4882a593Smuzhiyun vrev32.8 $H3,$H3 990*4882a593Smuzhiyun# endif 991*4882a593Smuzhiyun 992*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 993*4882a593Smuzhiyun @ lazy reduction interleaved with base 2^32 -> base 2^26 of 994*4882a593Smuzhiyun @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 995*4882a593Smuzhiyun 996*4882a593Smuzhiyun vshr.u64 $T0,$D3,#26 997*4882a593Smuzhiyun vmovn.i64 $D3#lo,$D3 998*4882a593Smuzhiyun vshr.u64 $T1,$D0,#26 999*4882a593Smuzhiyun vmovn.i64 $D0#lo,$D0 1000*4882a593Smuzhiyun vadd.i64 $D4,$D4,$T0 @ h3 -> h4 1001*4882a593Smuzhiyun vbic.i32 $D3#lo,#0xfc000000 1002*4882a593Smuzhiyun vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 1003*4882a593Smuzhiyun vadd.i64 $D1,$D1,$T1 @ h0 -> h1 1004*4882a593Smuzhiyun vshl.u32 $H3,$H3,#18 1005*4882a593Smuzhiyun vbic.i32 $D0#lo,#0xfc000000 1006*4882a593Smuzhiyun 1007*4882a593Smuzhiyun vshrn.u64 $T0#lo,$D4,#26 1008*4882a593Smuzhiyun vmovn.i64 $D4#lo,$D4 1009*4882a593Smuzhiyun vshr.u64 $T1,$D1,#26 1010*4882a593Smuzhiyun vmovn.i64 $D1#lo,$D1 1011*4882a593Smuzhiyun vadd.i64 $D2,$D2,$T1 @ h1 -> h2 1012*4882a593Smuzhiyun vsri.u32 $H3,$H2,#14 1013*4882a593Smuzhiyun vbic.i32 $D4#lo,#0xfc000000 1014*4882a593Smuzhiyun vshl.u32 $H2,$H2,#12 1015*4882a593Smuzhiyun vbic.i32 $D1#lo,#0xfc000000 1016*4882a593Smuzhiyun 1017*4882a593Smuzhiyun vadd.i32 $D0#lo,$D0#lo,$T0#lo 1018*4882a593Smuzhiyun vshl.u32 $T0#lo,$T0#lo,#2 1019*4882a593Smuzhiyun vbic.i32 $H3,#0xfc000000 1020*4882a593Smuzhiyun vshrn.u64 $T1#lo,$D2,#26 1021*4882a593Smuzhiyun vmovn.i64 $D2#lo,$D2 1022*4882a593Smuzhiyun vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] 1023*4882a593Smuzhiyun vsri.u32 $H2,$H1,#20 1024*4882a593Smuzhiyun vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 1025*4882a593Smuzhiyun vshl.u32 $H1,$H1,#6 1026*4882a593Smuzhiyun vbic.i32 $D2#lo,#0xfc000000 1027*4882a593Smuzhiyun vbic.i32 $H2,#0xfc000000 1028*4882a593Smuzhiyun 1029*4882a593Smuzhiyun vshrn.u64 $T0#lo,$D0,#26 @ re-narrow 1030*4882a593Smuzhiyun vmovn.i64 $D0#lo,$D0 1031*4882a593Smuzhiyun vsri.u32 $H1,$H0,#26 1032*4882a593Smuzhiyun vbic.i32 $H0,#0xfc000000 1033*4882a593Smuzhiyun vshr.u32 $T1#lo,$D3#lo,#26 1034*4882a593Smuzhiyun vbic.i32 $D3#lo,#0xfc000000 1035*4882a593Smuzhiyun vbic.i32 $D0#lo,#0xfc000000 1036*4882a593Smuzhiyun vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 1037*4882a593Smuzhiyun vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 1038*4882a593Smuzhiyun vbic.i32 $H1,#0xfc000000 1039*4882a593Smuzhiyun 1040*4882a593Smuzhiyun bhi .Loop_neon 1041*4882a593Smuzhiyun 1042*4882a593Smuzhiyun.Lskip_loop: 1043*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1044*4882a593Smuzhiyun @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1045*4882a593Smuzhiyun 1046*4882a593Smuzhiyun add $tbl1,$ctx,#(48+0*9*4) 1047*4882a593Smuzhiyun add $tbl0,$ctx,#(48+1*9*4) 1048*4882a593Smuzhiyun adds $len,$len,#32 1049*4882a593Smuzhiyun it ne 1050*4882a593Smuzhiyun movne $len,#0 1051*4882a593Smuzhiyun bne .Long_tail 1052*4882a593Smuzhiyun 1053*4882a593Smuzhiyun vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi 1054*4882a593Smuzhiyun vadd.i32 $H0#hi,$H0#lo,$D0#lo 1055*4882a593Smuzhiyun vadd.i32 $H3#hi,$H3#lo,$D3#lo 1056*4882a593Smuzhiyun vadd.i32 $H1#hi,$H1#lo,$D1#lo 1057*4882a593Smuzhiyun vadd.i32 $H4#hi,$H4#lo,$D4#lo 1058*4882a593Smuzhiyun 1059*4882a593Smuzhiyun.Long_tail: 1060*4882a593Smuzhiyun vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 1061*4882a593Smuzhiyun vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 1062*4882a593Smuzhiyun 1063*4882a593Smuzhiyun vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant 1064*4882a593Smuzhiyun vmull.u32 $D2,$H2#hi,$R0 1065*4882a593Smuzhiyun vadd.i32 $H0#lo,$H0#lo,$D0#lo 1066*4882a593Smuzhiyun vmull.u32 $D0,$H0#hi,$R0 1067*4882a593Smuzhiyun vadd.i32 $H3#lo,$H3#lo,$D3#lo 1068*4882a593Smuzhiyun vmull.u32 $D3,$H3#hi,$R0 1069*4882a593Smuzhiyun vadd.i32 $H1#lo,$H1#lo,$D1#lo 1070*4882a593Smuzhiyun vmull.u32 $D1,$H1#hi,$R0 1071*4882a593Smuzhiyun vadd.i32 $H4#lo,$H4#lo,$D4#lo 1072*4882a593Smuzhiyun vmull.u32 $D4,$H4#hi,$R0 1073*4882a593Smuzhiyun 1074*4882a593Smuzhiyun vmlal.u32 $D0,$H4#hi,$S1 1075*4882a593Smuzhiyun vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 1076*4882a593Smuzhiyun vmlal.u32 $D3,$H2#hi,$R1 1077*4882a593Smuzhiyun vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 1078*4882a593Smuzhiyun vmlal.u32 $D1,$H0#hi,$R1 1079*4882a593Smuzhiyun vmlal.u32 $D4,$H3#hi,$R1 1080*4882a593Smuzhiyun vmlal.u32 $D2,$H1#hi,$R1 1081*4882a593Smuzhiyun 1082*4882a593Smuzhiyun vmlal.u32 $D3,$H1#hi,$R2 1083*4882a593Smuzhiyun vld1.32 ${S4}[1],[$tbl1,:32] 1084*4882a593Smuzhiyun vmlal.u32 $D0,$H3#hi,$S2 1085*4882a593Smuzhiyun vld1.32 ${S4}[0],[$tbl0,:32] 1086*4882a593Smuzhiyun vmlal.u32 $D4,$H2#hi,$R2 1087*4882a593Smuzhiyun vmlal.u32 $D1,$H4#hi,$S2 1088*4882a593Smuzhiyun vmlal.u32 $D2,$H0#hi,$R2 1089*4882a593Smuzhiyun 1090*4882a593Smuzhiyun vmlal.u32 $D3,$H0#hi,$R3 1091*4882a593Smuzhiyun it ne 1092*4882a593Smuzhiyun addne $tbl1,$ctx,#(48+2*9*4) 1093*4882a593Smuzhiyun vmlal.u32 $D0,$H2#hi,$S3 1094*4882a593Smuzhiyun it ne 1095*4882a593Smuzhiyun addne $tbl0,$ctx,#(48+3*9*4) 1096*4882a593Smuzhiyun vmlal.u32 $D4,$H1#hi,$R3 1097*4882a593Smuzhiyun vmlal.u32 $D1,$H3#hi,$S3 1098*4882a593Smuzhiyun vmlal.u32 $D2,$H4#hi,$S3 1099*4882a593Smuzhiyun 1100*4882a593Smuzhiyun vmlal.u32 $D3,$H4#hi,$S4 1101*4882a593Smuzhiyun vorn $MASK,$MASK,$MASK @ all-ones, can be redundant 1102*4882a593Smuzhiyun vmlal.u32 $D0,$H1#hi,$S4 1103*4882a593Smuzhiyun vshr.u64 $MASK,$MASK,#38 1104*4882a593Smuzhiyun vmlal.u32 $D4,$H0#hi,$R4 1105*4882a593Smuzhiyun vmlal.u32 $D1,$H2#hi,$S4 1106*4882a593Smuzhiyun vmlal.u32 $D2,$H3#hi,$S4 1107*4882a593Smuzhiyun 1108*4882a593Smuzhiyun beq .Lshort_tail 1109*4882a593Smuzhiyun 1110*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1111*4882a593Smuzhiyun @ (hash+inp[0:1])*r^4:r^3 and accumulate 1112*4882a593Smuzhiyun 1113*4882a593Smuzhiyun vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 1114*4882a593Smuzhiyun vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 1115*4882a593Smuzhiyun 1116*4882a593Smuzhiyun vmlal.u32 $D2,$H2#lo,$R0 1117*4882a593Smuzhiyun vmlal.u32 $D0,$H0#lo,$R0 1118*4882a593Smuzhiyun vmlal.u32 $D3,$H3#lo,$R0 1119*4882a593Smuzhiyun vmlal.u32 $D1,$H1#lo,$R0 1120*4882a593Smuzhiyun vmlal.u32 $D4,$H4#lo,$R0 1121*4882a593Smuzhiyun 1122*4882a593Smuzhiyun vmlal.u32 $D0,$H4#lo,$S1 1123*4882a593Smuzhiyun vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 1124*4882a593Smuzhiyun vmlal.u32 $D3,$H2#lo,$R1 1125*4882a593Smuzhiyun vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 1126*4882a593Smuzhiyun vmlal.u32 $D1,$H0#lo,$R1 1127*4882a593Smuzhiyun vmlal.u32 $D4,$H3#lo,$R1 1128*4882a593Smuzhiyun vmlal.u32 $D2,$H1#lo,$R1 1129*4882a593Smuzhiyun 1130*4882a593Smuzhiyun vmlal.u32 $D3,$H1#lo,$R2 1131*4882a593Smuzhiyun vld1.32 ${S4}[1],[$tbl1,:32] 1132*4882a593Smuzhiyun vmlal.u32 $D0,$H3#lo,$S2 1133*4882a593Smuzhiyun vld1.32 ${S4}[0],[$tbl0,:32] 1134*4882a593Smuzhiyun vmlal.u32 $D4,$H2#lo,$R2 1135*4882a593Smuzhiyun vmlal.u32 $D1,$H4#lo,$S2 1136*4882a593Smuzhiyun vmlal.u32 $D2,$H0#lo,$R2 1137*4882a593Smuzhiyun 1138*4882a593Smuzhiyun vmlal.u32 $D3,$H0#lo,$R3 1139*4882a593Smuzhiyun vmlal.u32 $D0,$H2#lo,$S3 1140*4882a593Smuzhiyun vmlal.u32 $D4,$H1#lo,$R3 1141*4882a593Smuzhiyun vmlal.u32 $D1,$H3#lo,$S3 1142*4882a593Smuzhiyun vmlal.u32 $D2,$H4#lo,$S3 1143*4882a593Smuzhiyun 1144*4882a593Smuzhiyun vmlal.u32 $D3,$H4#lo,$S4 1145*4882a593Smuzhiyun vorn $MASK,$MASK,$MASK @ all-ones 1146*4882a593Smuzhiyun vmlal.u32 $D0,$H1#lo,$S4 1147*4882a593Smuzhiyun vshr.u64 $MASK,$MASK,#38 1148*4882a593Smuzhiyun vmlal.u32 $D4,$H0#lo,$R4 1149*4882a593Smuzhiyun vmlal.u32 $D1,$H2#lo,$S4 1150*4882a593Smuzhiyun vmlal.u32 $D2,$H3#lo,$S4 1151*4882a593Smuzhiyun 1152*4882a593Smuzhiyun.Lshort_tail: 1153*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1154*4882a593Smuzhiyun @ horizontal addition 1155*4882a593Smuzhiyun 1156*4882a593Smuzhiyun vadd.i64 $D3#lo,$D3#lo,$D3#hi 1157*4882a593Smuzhiyun vadd.i64 $D0#lo,$D0#lo,$D0#hi 1158*4882a593Smuzhiyun vadd.i64 $D4#lo,$D4#lo,$D4#hi 1159*4882a593Smuzhiyun vadd.i64 $D1#lo,$D1#lo,$D1#hi 1160*4882a593Smuzhiyun vadd.i64 $D2#lo,$D2#lo,$D2#hi 1161*4882a593Smuzhiyun 1162*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1163*4882a593Smuzhiyun @ lazy reduction, but without narrowing 1164*4882a593Smuzhiyun 1165*4882a593Smuzhiyun vshr.u64 $T0,$D3,#26 1166*4882a593Smuzhiyun vand.i64 $D3,$D3,$MASK 1167*4882a593Smuzhiyun vshr.u64 $T1,$D0,#26 1168*4882a593Smuzhiyun vand.i64 $D0,$D0,$MASK 1169*4882a593Smuzhiyun vadd.i64 $D4,$D4,$T0 @ h3 -> h4 1170*4882a593Smuzhiyun vadd.i64 $D1,$D1,$T1 @ h0 -> h1 1171*4882a593Smuzhiyun 1172*4882a593Smuzhiyun vshr.u64 $T0,$D4,#26 1173*4882a593Smuzhiyun vand.i64 $D4,$D4,$MASK 1174*4882a593Smuzhiyun vshr.u64 $T1,$D1,#26 1175*4882a593Smuzhiyun vand.i64 $D1,$D1,$MASK 1176*4882a593Smuzhiyun vadd.i64 $D2,$D2,$T1 @ h1 -> h2 1177*4882a593Smuzhiyun 1178*4882a593Smuzhiyun vadd.i64 $D0,$D0,$T0 1179*4882a593Smuzhiyun vshl.u64 $T0,$T0,#2 1180*4882a593Smuzhiyun vshr.u64 $T1,$D2,#26 1181*4882a593Smuzhiyun vand.i64 $D2,$D2,$MASK 1182*4882a593Smuzhiyun vadd.i64 $D0,$D0,$T0 @ h4 -> h0 1183*4882a593Smuzhiyun vadd.i64 $D3,$D3,$T1 @ h2 -> h3 1184*4882a593Smuzhiyun 1185*4882a593Smuzhiyun vshr.u64 $T0,$D0,#26 1186*4882a593Smuzhiyun vand.i64 $D0,$D0,$MASK 1187*4882a593Smuzhiyun vshr.u64 $T1,$D3,#26 1188*4882a593Smuzhiyun vand.i64 $D3,$D3,$MASK 1189*4882a593Smuzhiyun vadd.i64 $D1,$D1,$T0 @ h0 -> h1 1190*4882a593Smuzhiyun vadd.i64 $D4,$D4,$T1 @ h3 -> h4 1191*4882a593Smuzhiyun 1192*4882a593Smuzhiyun cmp $len,#0 1193*4882a593Smuzhiyun bne .Leven 1194*4882a593Smuzhiyun 1195*4882a593Smuzhiyun @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1196*4882a593Smuzhiyun @ store hash value 1197*4882a593Smuzhiyun 1198*4882a593Smuzhiyun vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 1199*4882a593Smuzhiyun vst1.32 {$D4#lo[0]},[$ctx] 1200*4882a593Smuzhiyun 1201*4882a593Smuzhiyun vldmia sp!,{d8-d15} @ epilogue 1202*4882a593Smuzhiyun ldmia sp!,{r4-r7} 1203*4882a593Smuzhiyun ret @ bx lr 1204*4882a593Smuzhiyun.size poly1305_blocks_neon,.-poly1305_blocks_neon 1205*4882a593Smuzhiyun 1206*4882a593Smuzhiyun.align 5 1207*4882a593Smuzhiyun.Lzeros: 1208*4882a593Smuzhiyun.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 1209*4882a593Smuzhiyun#ifndef __KERNEL__ 1210*4882a593Smuzhiyun.LOPENSSL_armcap: 1211*4882a593Smuzhiyun# ifdef _WIN32 1212*4882a593Smuzhiyun.word OPENSSL_armcap_P 1213*4882a593Smuzhiyun# else 1214*4882a593Smuzhiyun.word OPENSSL_armcap_P-.Lpoly1305_init 1215*4882a593Smuzhiyun# endif 1216*4882a593Smuzhiyun.comm OPENSSL_armcap_P,4,4 1217*4882a593Smuzhiyun.hidden OPENSSL_armcap_P 1218*4882a593Smuzhiyun#endif 1219*4882a593Smuzhiyun#endif 1220*4882a593Smuzhiyun___ 1221*4882a593Smuzhiyun} } 1222*4882a593Smuzhiyun$code.=<<___; 1223*4882a593Smuzhiyun.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" 1224*4882a593Smuzhiyun.align 2 1225*4882a593Smuzhiyun___ 1226*4882a593Smuzhiyun 1227*4882a593Smuzhiyunforeach (split("\n",$code)) { 1228*4882a593Smuzhiyun s/\`([^\`]*)\`/eval $1/geo; 1229*4882a593Smuzhiyun 1230*4882a593Smuzhiyun s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or 1231*4882a593Smuzhiyun s/\bret\b/bx lr/go or 1232*4882a593Smuzhiyun s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 1233*4882a593Smuzhiyun 1234*4882a593Smuzhiyun print $_,"\n"; 1235*4882a593Smuzhiyun} 1236*4882a593Smuzhiyunclose STDOUT; # enforce flush 1237