#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;

# Usage: poly1305-armv8.pl [flavour [output]]
#
# The generated assembly is accumulated in $code.  When a flavour other
# than "void" is given, STDOUT is piped through arm-xlate.pl (looked up
# next to this script, then in ../../perlasm/); otherwise STDOUT is
# redirected straight to the output file.
$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    # Directory part of this script's own path, including the trailing
    # separator ('/' or '\').
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # The command is deliberately a single string so that it is run the
    # same way as before; a failure to start it is now fatal instead of
    # silently discarding the generated code.
    open STDOUT, '|-', "\"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined $output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
# With no output argument STDOUT is left untouched.

# Scalar register allocation.  The four leading names map to x0..x3;
# $mac and $nonce are aliases of $inp and $len for poly1305_emit.
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

# poly1305_init, poly1305_blocks and poly1305_emit (integer-only code).
# "w#$reg" spells the 32-bit view of a register for the translator.
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	mov	w#$s1,#-1
	stp	$r0,$r1,[$ctx,#32]	// save key value
	str	w#$s1,[$ctx,#48]	// impossible key power value

#ifndef	__KERNEL__
	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit

	csel	$d0,$d0,$r0,eq

# ifdef	__ILP32__
	stp	w#$d0,w#$d1,[$len]
# else
	stp	$d0,$d1,[$len]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
	ldp	$r0,$r1,[$ctx,#32]	// load key value

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	x17,#0			// is_base2_26?
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
	ldp	$t0,$t1,[$nonce]	// load nonce

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	$r0,#0			// is_base2_26?
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
# Vector register names (v0..v31 with arrangement suffixes) for the
# NEON code that follows.  $T0, $T1 and $MASK carry no suffix here; the
# arrangement is appended at each use.
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

269*4882a593Smuzhiyunmy ($in2,$zeros)=("x16","x17"); 270*4882a593Smuzhiyunmy $is_base2_26 = $zeros; # borrow 271*4882a593Smuzhiyun 272*4882a593Smuzhiyun$code.=<<___; 273*4882a593Smuzhiyun.type poly1305_mult,%function 274*4882a593Smuzhiyun.align 5 275*4882a593Smuzhiyunpoly1305_mult: 276*4882a593Smuzhiyun mul $d0,$h0,$r0 // h0*r0 277*4882a593Smuzhiyun umulh $d1,$h0,$r0 278*4882a593Smuzhiyun 279*4882a593Smuzhiyun mul $t0,$h1,$s1 // h1*5*r1 280*4882a593Smuzhiyun umulh $t1,$h1,$s1 281*4882a593Smuzhiyun 282*4882a593Smuzhiyun adds $d0,$d0,$t0 283*4882a593Smuzhiyun mul $t0,$h0,$r1 // h0*r1 284*4882a593Smuzhiyun adc $d1,$d1,$t1 285*4882a593Smuzhiyun umulh $d2,$h0,$r1 286*4882a593Smuzhiyun 287*4882a593Smuzhiyun adds $d1,$d1,$t0 288*4882a593Smuzhiyun mul $t0,$h1,$r0 // h1*r0 289*4882a593Smuzhiyun adc $d2,$d2,xzr 290*4882a593Smuzhiyun umulh $t1,$h1,$r0 291*4882a593Smuzhiyun 292*4882a593Smuzhiyun adds $d1,$d1,$t0 293*4882a593Smuzhiyun mul $t0,$h2,$s1 // h2*5*r1 294*4882a593Smuzhiyun adc $d2,$d2,$t1 295*4882a593Smuzhiyun mul $t1,$h2,$r0 // h2*r0 296*4882a593Smuzhiyun 297*4882a593Smuzhiyun adds $d1,$d1,$t0 298*4882a593Smuzhiyun adc $d2,$d2,$t1 299*4882a593Smuzhiyun 300*4882a593Smuzhiyun and $t0,$d2,#-4 // final reduction 301*4882a593Smuzhiyun and $h2,$d2,#3 302*4882a593Smuzhiyun add $t0,$t0,$d2,lsr#2 303*4882a593Smuzhiyun adds $h0,$d0,$t0 304*4882a593Smuzhiyun adcs $h1,$d1,xzr 305*4882a593Smuzhiyun adc $h2,$h2,xzr 306*4882a593Smuzhiyun 307*4882a593Smuzhiyun ret 308*4882a593Smuzhiyun.size poly1305_mult,.-poly1305_mult 309*4882a593Smuzhiyun 310*4882a593Smuzhiyun.type poly1305_splat,%function 311*4882a593Smuzhiyun.align 4 312*4882a593Smuzhiyunpoly1305_splat: 313*4882a593Smuzhiyun and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 314*4882a593Smuzhiyun ubfx x13,$h0,#26,#26 315*4882a593Smuzhiyun extr x14,$h1,$h0,#52 316*4882a593Smuzhiyun and x14,x14,#0x03ffffff 317*4882a593Smuzhiyun ubfx x15,$h1,#14,#26 318*4882a593Smuzhiyun extr x16,$h2,$h1,#40 319*4882a593Smuzhiyun 
320*4882a593Smuzhiyun str w12,[$ctx,#16*0] // r0 321*4882a593Smuzhiyun add w12,w13,w13,lsl#2 // r1*5 322*4882a593Smuzhiyun str w13,[$ctx,#16*1] // r1 323*4882a593Smuzhiyun add w13,w14,w14,lsl#2 // r2*5 324*4882a593Smuzhiyun str w12,[$ctx,#16*2] // s1 325*4882a593Smuzhiyun str w14,[$ctx,#16*3] // r2 326*4882a593Smuzhiyun add w14,w15,w15,lsl#2 // r3*5 327*4882a593Smuzhiyun str w13,[$ctx,#16*4] // s2 328*4882a593Smuzhiyun str w15,[$ctx,#16*5] // r3 329*4882a593Smuzhiyun add w15,w16,w16,lsl#2 // r4*5 330*4882a593Smuzhiyun str w14,[$ctx,#16*6] // s3 331*4882a593Smuzhiyun str w16,[$ctx,#16*7] // r4 332*4882a593Smuzhiyun str w15,[$ctx,#16*8] // s4 333*4882a593Smuzhiyun 334*4882a593Smuzhiyun ret 335*4882a593Smuzhiyun.size poly1305_splat,.-poly1305_splat 336*4882a593Smuzhiyun 337*4882a593Smuzhiyun#ifdef __KERNEL__ 338*4882a593Smuzhiyun.globl poly1305_blocks_neon 339*4882a593Smuzhiyun#endif 340*4882a593Smuzhiyun.type poly1305_blocks_neon,%function 341*4882a593Smuzhiyun.align 5 342*4882a593Smuzhiyunpoly1305_blocks_neon: 343*4882a593Smuzhiyun.Lpoly1305_blocks_neon: 344*4882a593Smuzhiyun ldr $is_base2_26,[$ctx,#24] 345*4882a593Smuzhiyun cmp $len,#128 346*4882a593Smuzhiyun b.lo .Lpoly1305_blocks 347*4882a593Smuzhiyun 348*4882a593Smuzhiyun .inst 0xd503233f // paciasp 349*4882a593Smuzhiyun stp x29,x30,[sp,#-80]! 
350*4882a593Smuzhiyun add x29,sp,#0 351*4882a593Smuzhiyun 352*4882a593Smuzhiyun stp d8,d9,[sp,#16] // meet ABI requirements 353*4882a593Smuzhiyun stp d10,d11,[sp,#32] 354*4882a593Smuzhiyun stp d12,d13,[sp,#48] 355*4882a593Smuzhiyun stp d14,d15,[sp,#64] 356*4882a593Smuzhiyun 357*4882a593Smuzhiyun cbz $is_base2_26,.Lbase2_64_neon 358*4882a593Smuzhiyun 359*4882a593Smuzhiyun ldp w10,w11,[$ctx] // load hash value base 2^26 360*4882a593Smuzhiyun ldp w12,w13,[$ctx,#8] 361*4882a593Smuzhiyun ldr w14,[$ctx,#16] 362*4882a593Smuzhiyun 363*4882a593Smuzhiyun tst $len,#31 364*4882a593Smuzhiyun b.eq .Leven_neon 365*4882a593Smuzhiyun 366*4882a593Smuzhiyun ldp $r0,$r1,[$ctx,#32] // load key value 367*4882a593Smuzhiyun 368*4882a593Smuzhiyun add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 369*4882a593Smuzhiyun lsr $h1,x12,#12 370*4882a593Smuzhiyun adds $h0,$h0,x12,lsl#52 371*4882a593Smuzhiyun add $h1,$h1,x13,lsl#14 372*4882a593Smuzhiyun adc $h1,$h1,xzr 373*4882a593Smuzhiyun lsr $h2,x14,#24 374*4882a593Smuzhiyun adds $h1,$h1,x14,lsl#40 375*4882a593Smuzhiyun adc $d2,$h2,xzr // can be partially reduced... 
376*4882a593Smuzhiyun 377*4882a593Smuzhiyun ldp $d0,$d1,[$inp],#16 // load input 378*4882a593Smuzhiyun sub $len,$len,#16 379*4882a593Smuzhiyun add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 380*4882a593Smuzhiyun 381*4882a593Smuzhiyun#ifdef __AARCH64EB__ 382*4882a593Smuzhiyun rev $d0,$d0 383*4882a593Smuzhiyun rev $d1,$d1 384*4882a593Smuzhiyun#endif 385*4882a593Smuzhiyun adds $h0,$h0,$d0 // accumulate input 386*4882a593Smuzhiyun adcs $h1,$h1,$d1 387*4882a593Smuzhiyun adc $h2,$h2,$padbit 388*4882a593Smuzhiyun 389*4882a593Smuzhiyun bl poly1305_mult 390*4882a593Smuzhiyun 391*4882a593Smuzhiyun and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 392*4882a593Smuzhiyun ubfx x11,$h0,#26,#26 393*4882a593Smuzhiyun extr x12,$h1,$h0,#52 394*4882a593Smuzhiyun and x12,x12,#0x03ffffff 395*4882a593Smuzhiyun ubfx x13,$h1,#14,#26 396*4882a593Smuzhiyun extr x14,$h2,$h1,#40 397*4882a593Smuzhiyun 398*4882a593Smuzhiyun b .Leven_neon 399*4882a593Smuzhiyun 400*4882a593Smuzhiyun.align 4 401*4882a593Smuzhiyun.Lbase2_64_neon: 402*4882a593Smuzhiyun ldp $r0,$r1,[$ctx,#32] // load key value 403*4882a593Smuzhiyun 404*4882a593Smuzhiyun ldp $h0,$h1,[$ctx] // load hash value base 2^64 405*4882a593Smuzhiyun ldr $h2,[$ctx,#16] 406*4882a593Smuzhiyun 407*4882a593Smuzhiyun tst $len,#31 408*4882a593Smuzhiyun b.eq .Linit_neon 409*4882a593Smuzhiyun 410*4882a593Smuzhiyun ldp $d0,$d1,[$inp],#16 // load input 411*4882a593Smuzhiyun sub $len,$len,#16 412*4882a593Smuzhiyun add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 413*4882a593Smuzhiyun#ifdef __AARCH64EB__ 414*4882a593Smuzhiyun rev $d0,$d0 415*4882a593Smuzhiyun rev $d1,$d1 416*4882a593Smuzhiyun#endif 417*4882a593Smuzhiyun adds $h0,$h0,$d0 // accumulate input 418*4882a593Smuzhiyun adcs $h1,$h1,$d1 419*4882a593Smuzhiyun adc $h2,$h2,$padbit 420*4882a593Smuzhiyun 421*4882a593Smuzhiyun bl poly1305_mult 422*4882a593Smuzhiyun 423*4882a593Smuzhiyun.Linit_neon: 424*4882a593Smuzhiyun ldr w17,[$ctx,#48] // first table element 425*4882a593Smuzhiyun and x10,$h0,#0x03ffffff // 
base 2^64 -> base 2^26 426*4882a593Smuzhiyun ubfx x11,$h0,#26,#26 427*4882a593Smuzhiyun extr x12,$h1,$h0,#52 428*4882a593Smuzhiyun and x12,x12,#0x03ffffff 429*4882a593Smuzhiyun ubfx x13,$h1,#14,#26 430*4882a593Smuzhiyun extr x14,$h2,$h1,#40 431*4882a593Smuzhiyun 432*4882a593Smuzhiyun cmp w17,#-1 // is value impossible? 433*4882a593Smuzhiyun b.ne .Leven_neon 434*4882a593Smuzhiyun 435*4882a593Smuzhiyun fmov ${H0},x10 436*4882a593Smuzhiyun fmov ${H1},x11 437*4882a593Smuzhiyun fmov ${H2},x12 438*4882a593Smuzhiyun fmov ${H3},x13 439*4882a593Smuzhiyun fmov ${H4},x14 440*4882a593Smuzhiyun 441*4882a593Smuzhiyun ////////////////////////////////// initialize r^n table 442*4882a593Smuzhiyun mov $h0,$r0 // r^1 443*4882a593Smuzhiyun add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 444*4882a593Smuzhiyun mov $h1,$r1 445*4882a593Smuzhiyun mov $h2,xzr 446*4882a593Smuzhiyun add $ctx,$ctx,#48+12 447*4882a593Smuzhiyun bl poly1305_splat 448*4882a593Smuzhiyun 449*4882a593Smuzhiyun bl poly1305_mult // r^2 450*4882a593Smuzhiyun sub $ctx,$ctx,#4 451*4882a593Smuzhiyun bl poly1305_splat 452*4882a593Smuzhiyun 453*4882a593Smuzhiyun bl poly1305_mult // r^3 454*4882a593Smuzhiyun sub $ctx,$ctx,#4 455*4882a593Smuzhiyun bl poly1305_splat 456*4882a593Smuzhiyun 457*4882a593Smuzhiyun bl poly1305_mult // r^4 458*4882a593Smuzhiyun sub $ctx,$ctx,#4 459*4882a593Smuzhiyun bl poly1305_splat 460*4882a593Smuzhiyun sub $ctx,$ctx,#48 // restore original $ctx 461*4882a593Smuzhiyun b .Ldo_neon 462*4882a593Smuzhiyun 463*4882a593Smuzhiyun.align 4 464*4882a593Smuzhiyun.Leven_neon: 465*4882a593Smuzhiyun fmov ${H0},x10 466*4882a593Smuzhiyun fmov ${H1},x11 467*4882a593Smuzhiyun fmov ${H2},x12 468*4882a593Smuzhiyun fmov ${H3},x13 469*4882a593Smuzhiyun fmov ${H4},x14 470*4882a593Smuzhiyun 471*4882a593Smuzhiyun.Ldo_neon: 472*4882a593Smuzhiyun ldp x8,x12,[$inp,#32] // inp[2:3] 473*4882a593Smuzhiyun subs $len,$len,#64 474*4882a593Smuzhiyun ldp x9,x13,[$inp,#48] 475*4882a593Smuzhiyun add $in2,$inp,#96 476*4882a593Smuzhiyun adr 
$zeros,.Lzeros 477*4882a593Smuzhiyun 478*4882a593Smuzhiyun lsl $padbit,$padbit,#24 479*4882a593Smuzhiyun add x15,$ctx,#48 480*4882a593Smuzhiyun 481*4882a593Smuzhiyun#ifdef __AARCH64EB__ 482*4882a593Smuzhiyun rev x8,x8 483*4882a593Smuzhiyun rev x12,x12 484*4882a593Smuzhiyun rev x9,x9 485*4882a593Smuzhiyun rev x13,x13 486*4882a593Smuzhiyun#endif 487*4882a593Smuzhiyun and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 488*4882a593Smuzhiyun and x5,x9,#0x03ffffff 489*4882a593Smuzhiyun ubfx x6,x8,#26,#26 490*4882a593Smuzhiyun ubfx x7,x9,#26,#26 491*4882a593Smuzhiyun add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 492*4882a593Smuzhiyun extr x8,x12,x8,#52 493*4882a593Smuzhiyun extr x9,x13,x9,#52 494*4882a593Smuzhiyun add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 495*4882a593Smuzhiyun fmov $IN23_0,x4 496*4882a593Smuzhiyun and x8,x8,#0x03ffffff 497*4882a593Smuzhiyun and x9,x9,#0x03ffffff 498*4882a593Smuzhiyun ubfx x10,x12,#14,#26 499*4882a593Smuzhiyun ubfx x11,x13,#14,#26 500*4882a593Smuzhiyun add x12,$padbit,x12,lsr#40 501*4882a593Smuzhiyun add x13,$padbit,x13,lsr#40 502*4882a593Smuzhiyun add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 503*4882a593Smuzhiyun fmov $IN23_1,x6 504*4882a593Smuzhiyun add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 505*4882a593Smuzhiyun add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 506*4882a593Smuzhiyun fmov $IN23_2,x8 507*4882a593Smuzhiyun fmov $IN23_3,x10 508*4882a593Smuzhiyun fmov $IN23_4,x12 509*4882a593Smuzhiyun 510*4882a593Smuzhiyun ldp x8,x12,[$inp],#16 // inp[0:1] 511*4882a593Smuzhiyun ldp x9,x13,[$inp],#48 512*4882a593Smuzhiyun 513*4882a593Smuzhiyun ld1 {$R0,$R1,$S1,$R2},[x15],#64 514*4882a593Smuzhiyun ld1 {$S2,$R3,$S3,$R4},[x15],#64 515*4882a593Smuzhiyun ld1 {$S4},[x15] 516*4882a593Smuzhiyun 517*4882a593Smuzhiyun#ifdef __AARCH64EB__ 518*4882a593Smuzhiyun rev x8,x8 519*4882a593Smuzhiyun rev x12,x12 520*4882a593Smuzhiyun rev x9,x9 521*4882a593Smuzhiyun rev x13,x13 522*4882a593Smuzhiyun#endif 523*4882a593Smuzhiyun and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 
524*4882a593Smuzhiyun and x5,x9,#0x03ffffff 525*4882a593Smuzhiyun ubfx x6,x8,#26,#26 526*4882a593Smuzhiyun ubfx x7,x9,#26,#26 527*4882a593Smuzhiyun add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 528*4882a593Smuzhiyun extr x8,x12,x8,#52 529*4882a593Smuzhiyun extr x9,x13,x9,#52 530*4882a593Smuzhiyun add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 531*4882a593Smuzhiyun fmov $IN01_0,x4 532*4882a593Smuzhiyun and x8,x8,#0x03ffffff 533*4882a593Smuzhiyun and x9,x9,#0x03ffffff 534*4882a593Smuzhiyun ubfx x10,x12,#14,#26 535*4882a593Smuzhiyun ubfx x11,x13,#14,#26 536*4882a593Smuzhiyun add x12,$padbit,x12,lsr#40 537*4882a593Smuzhiyun add x13,$padbit,x13,lsr#40 538*4882a593Smuzhiyun add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 539*4882a593Smuzhiyun fmov $IN01_1,x6 540*4882a593Smuzhiyun add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 541*4882a593Smuzhiyun add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 542*4882a593Smuzhiyun movi $MASK.2d,#-1 543*4882a593Smuzhiyun fmov $IN01_2,x8 544*4882a593Smuzhiyun fmov $IN01_3,x10 545*4882a593Smuzhiyun fmov $IN01_4,x12 546*4882a593Smuzhiyun ushr $MASK.2d,$MASK.2d,#38 547*4882a593Smuzhiyun 548*4882a593Smuzhiyun b.ls .Lskip_loop 549*4882a593Smuzhiyun 550*4882a593Smuzhiyun.align 4 551*4882a593Smuzhiyun.Loop_neon: 552*4882a593Smuzhiyun //////////////////////////////////////////////////////////////// 553*4882a593Smuzhiyun // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 554*4882a593Smuzhiyun // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 555*4882a593Smuzhiyun // \___________________/ 556*4882a593Smuzhiyun // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 557*4882a593Smuzhiyun // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 558*4882a593Smuzhiyun // \___________________/ \____________________/ 559*4882a593Smuzhiyun // 560*4882a593Smuzhiyun // Note that we start with inp[2:3]*r^2. This is because it 561*4882a593Smuzhiyun // doesn't depend on reduction in previous iteration. 
562*4882a593Smuzhiyun //////////////////////////////////////////////////////////////// 563*4882a593Smuzhiyun // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 564*4882a593Smuzhiyun // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 565*4882a593Smuzhiyun // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 566*4882a593Smuzhiyun // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 567*4882a593Smuzhiyun // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 568*4882a593Smuzhiyun 569*4882a593Smuzhiyun subs $len,$len,#64 570*4882a593Smuzhiyun umull $ACC4,$IN23_0,${R4}[2] 571*4882a593Smuzhiyun csel $in2,$zeros,$in2,lo 572*4882a593Smuzhiyun umull $ACC3,$IN23_0,${R3}[2] 573*4882a593Smuzhiyun umull $ACC2,$IN23_0,${R2}[2] 574*4882a593Smuzhiyun ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) 575*4882a593Smuzhiyun umull $ACC1,$IN23_0,${R1}[2] 576*4882a593Smuzhiyun ldp x9,x13,[$in2],#48 577*4882a593Smuzhiyun umull $ACC0,$IN23_0,${R0}[2] 578*4882a593Smuzhiyun#ifdef __AARCH64EB__ 579*4882a593Smuzhiyun rev x8,x8 580*4882a593Smuzhiyun rev x12,x12 581*4882a593Smuzhiyun rev x9,x9 582*4882a593Smuzhiyun rev x13,x13 583*4882a593Smuzhiyun#endif 584*4882a593Smuzhiyun 585*4882a593Smuzhiyun umlal $ACC4,$IN23_1,${R3}[2] 586*4882a593Smuzhiyun and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 587*4882a593Smuzhiyun umlal $ACC3,$IN23_1,${R2}[2] 588*4882a593Smuzhiyun and x5,x9,#0x03ffffff 589*4882a593Smuzhiyun umlal $ACC2,$IN23_1,${R1}[2] 590*4882a593Smuzhiyun ubfx x6,x8,#26,#26 591*4882a593Smuzhiyun umlal $ACC1,$IN23_1,${R0}[2] 592*4882a593Smuzhiyun ubfx x7,x9,#26,#26 593*4882a593Smuzhiyun umlal $ACC0,$IN23_1,${S4}[2] 594*4882a593Smuzhiyun add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 595*4882a593Smuzhiyun 596*4882a593Smuzhiyun umlal $ACC4,$IN23_2,${R2}[2] 597*4882a593Smuzhiyun extr x8,x12,x8,#52 598*4882a593Smuzhiyun umlal $ACC3,$IN23_2,${R1}[2] 599*4882a593Smuzhiyun extr x9,x13,x9,#52 600*4882a593Smuzhiyun umlal $ACC2,$IN23_2,${R0}[2] 601*4882a593Smuzhiyun add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 
602*4882a593Smuzhiyun umlal $ACC1,$IN23_2,${S4}[2] 603*4882a593Smuzhiyun fmov $IN23_0,x4 604*4882a593Smuzhiyun umlal $ACC0,$IN23_2,${S3}[2] 605*4882a593Smuzhiyun and x8,x8,#0x03ffffff 606*4882a593Smuzhiyun 607*4882a593Smuzhiyun umlal $ACC4,$IN23_3,${R1}[2] 608*4882a593Smuzhiyun and x9,x9,#0x03ffffff 609*4882a593Smuzhiyun umlal $ACC3,$IN23_3,${R0}[2] 610*4882a593Smuzhiyun ubfx x10,x12,#14,#26 611*4882a593Smuzhiyun umlal $ACC2,$IN23_3,${S4}[2] 612*4882a593Smuzhiyun ubfx x11,x13,#14,#26 613*4882a593Smuzhiyun umlal $ACC1,$IN23_3,${S3}[2] 614*4882a593Smuzhiyun add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 615*4882a593Smuzhiyun umlal $ACC0,$IN23_3,${S2}[2] 616*4882a593Smuzhiyun fmov $IN23_1,x6 617*4882a593Smuzhiyun 618*4882a593Smuzhiyun add $IN01_2,$IN01_2,$H2 619*4882a593Smuzhiyun add x12,$padbit,x12,lsr#40 620*4882a593Smuzhiyun umlal $ACC4,$IN23_4,${R0}[2] 621*4882a593Smuzhiyun add x13,$padbit,x13,lsr#40 622*4882a593Smuzhiyun umlal $ACC3,$IN23_4,${S4}[2] 623*4882a593Smuzhiyun add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 624*4882a593Smuzhiyun umlal $ACC2,$IN23_4,${S3}[2] 625*4882a593Smuzhiyun add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 626*4882a593Smuzhiyun umlal $ACC1,$IN23_4,${S2}[2] 627*4882a593Smuzhiyun fmov $IN23_2,x8 628*4882a593Smuzhiyun umlal $ACC0,$IN23_4,${S1}[2] 629*4882a593Smuzhiyun fmov $IN23_3,x10 630*4882a593Smuzhiyun 631*4882a593Smuzhiyun //////////////////////////////////////////////////////////////// 632*4882a593Smuzhiyun // (hash+inp[0:1])*r^4 and accumulate 633*4882a593Smuzhiyun 634*4882a593Smuzhiyun add $IN01_0,$IN01_0,$H0 635*4882a593Smuzhiyun fmov $IN23_4,x12 636*4882a593Smuzhiyun umlal $ACC3,$IN01_2,${R1}[0] 637*4882a593Smuzhiyun ldp x8,x12,[$inp],#16 // inp[0:1] 638*4882a593Smuzhiyun umlal $ACC0,$IN01_2,${S3}[0] 639*4882a593Smuzhiyun ldp x9,x13,[$inp],#48 640*4882a593Smuzhiyun umlal $ACC4,$IN01_2,${R2}[0] 641*4882a593Smuzhiyun umlal $ACC1,$IN01_2,${S4}[0] 642*4882a593Smuzhiyun umlal $ACC2,$IN01_2,${R0}[0] 643*4882a593Smuzhiyun#ifdef __AARCH64EB__ 
644*4882a593Smuzhiyun rev x8,x8 645*4882a593Smuzhiyun rev x12,x12 646*4882a593Smuzhiyun rev x9,x9 647*4882a593Smuzhiyun rev x13,x13 648*4882a593Smuzhiyun#endif 649*4882a593Smuzhiyun 650*4882a593Smuzhiyun add $IN01_1,$IN01_1,$H1 651*4882a593Smuzhiyun umlal $ACC3,$IN01_0,${R3}[0] 652*4882a593Smuzhiyun umlal $ACC4,$IN01_0,${R4}[0] 653*4882a593Smuzhiyun and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 654*4882a593Smuzhiyun umlal $ACC2,$IN01_0,${R2}[0] 655*4882a593Smuzhiyun and x5,x9,#0x03ffffff 656*4882a593Smuzhiyun umlal $ACC0,$IN01_0,${R0}[0] 657*4882a593Smuzhiyun ubfx x6,x8,#26,#26 658*4882a593Smuzhiyun umlal $ACC1,$IN01_0,${R1}[0] 659*4882a593Smuzhiyun ubfx x7,x9,#26,#26 660*4882a593Smuzhiyun 661*4882a593Smuzhiyun add $IN01_3,$IN01_3,$H3 662*4882a593Smuzhiyun add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 663*4882a593Smuzhiyun umlal $ACC3,$IN01_1,${R2}[0] 664*4882a593Smuzhiyun extr x8,x12,x8,#52 665*4882a593Smuzhiyun umlal $ACC4,$IN01_1,${R3}[0] 666*4882a593Smuzhiyun extr x9,x13,x9,#52 667*4882a593Smuzhiyun umlal $ACC0,$IN01_1,${S4}[0] 668*4882a593Smuzhiyun add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 669*4882a593Smuzhiyun umlal $ACC2,$IN01_1,${R1}[0] 670*4882a593Smuzhiyun fmov $IN01_0,x4 671*4882a593Smuzhiyun umlal $ACC1,$IN01_1,${R0}[0] 672*4882a593Smuzhiyun and x8,x8,#0x03ffffff 673*4882a593Smuzhiyun 674*4882a593Smuzhiyun add $IN01_4,$IN01_4,$H4 675*4882a593Smuzhiyun and x9,x9,#0x03ffffff 676*4882a593Smuzhiyun umlal $ACC3,$IN01_3,${R0}[0] 677*4882a593Smuzhiyun ubfx x10,x12,#14,#26 678*4882a593Smuzhiyun umlal $ACC0,$IN01_3,${S2}[0] 679*4882a593Smuzhiyun ubfx x11,x13,#14,#26 680*4882a593Smuzhiyun umlal $ACC4,$IN01_3,${R1}[0] 681*4882a593Smuzhiyun add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 682*4882a593Smuzhiyun umlal $ACC1,$IN01_3,${S3}[0] 683*4882a593Smuzhiyun fmov $IN01_1,x6 684*4882a593Smuzhiyun umlal $ACC2,$IN01_3,${S4}[0] 685*4882a593Smuzhiyun add x12,$padbit,x12,lsr#40 686*4882a593Smuzhiyun 687*4882a593Smuzhiyun umlal $ACC3,$IN01_4,${S4}[0] 688*4882a593Smuzhiyun add 
x13,$padbit,x13,lsr#40 689*4882a593Smuzhiyun umlal $ACC0,$IN01_4,${S1}[0] 690*4882a593Smuzhiyun add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 691*4882a593Smuzhiyun umlal $ACC4,$IN01_4,${R0}[0] 692*4882a593Smuzhiyun add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 693*4882a593Smuzhiyun umlal $ACC1,$IN01_4,${S2}[0] 694*4882a593Smuzhiyun fmov $IN01_2,x8 695*4882a593Smuzhiyun umlal $ACC2,$IN01_4,${S3}[0] 696*4882a593Smuzhiyun fmov $IN01_3,x10 697*4882a593Smuzhiyun fmov $IN01_4,x12 698*4882a593Smuzhiyun 699*4882a593Smuzhiyun ///////////////////////////////////////////////////////////////// 700*4882a593Smuzhiyun // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 701*4882a593Smuzhiyun // and P. Schwabe 702*4882a593Smuzhiyun // 703*4882a593Smuzhiyun // [see discussion in poly1305-armv4 module] 704*4882a593Smuzhiyun 705*4882a593Smuzhiyun ushr $T0.2d,$ACC3,#26 706*4882a593Smuzhiyun xtn $H3,$ACC3 707*4882a593Smuzhiyun ushr $T1.2d,$ACC0,#26 708*4882a593Smuzhiyun and $ACC0,$ACC0,$MASK.2d 709*4882a593Smuzhiyun add $ACC4,$ACC4,$T0.2d // h3 -> h4 710*4882a593Smuzhiyun bic $H3,#0xfc,lsl#24 // &=0x03ffffff 711*4882a593Smuzhiyun add $ACC1,$ACC1,$T1.2d // h0 -> h1 712*4882a593Smuzhiyun 713*4882a593Smuzhiyun ushr $T0.2d,$ACC4,#26 714*4882a593Smuzhiyun xtn $H4,$ACC4 715*4882a593Smuzhiyun ushr $T1.2d,$ACC1,#26 716*4882a593Smuzhiyun xtn $H1,$ACC1 717*4882a593Smuzhiyun bic $H4,#0xfc,lsl#24 718*4882a593Smuzhiyun add $ACC2,$ACC2,$T1.2d // h1 -> h2 719*4882a593Smuzhiyun 720*4882a593Smuzhiyun add $ACC0,$ACC0,$T0.2d 721*4882a593Smuzhiyun shl $T0.2d,$T0.2d,#2 722*4882a593Smuzhiyun shrn $T1.2s,$ACC2,#26 723*4882a593Smuzhiyun xtn $H2,$ACC2 724*4882a593Smuzhiyun add $ACC0,$ACC0,$T0.2d // h4 -> h0 725*4882a593Smuzhiyun bic $H1,#0xfc,lsl#24 726*4882a593Smuzhiyun add $H3,$H3,$T1.2s // h2 -> h3 727*4882a593Smuzhiyun bic $H2,#0xfc,lsl#24 728*4882a593Smuzhiyun 729*4882a593Smuzhiyun shrn $T0.2s,$ACC0,#26 730*4882a593Smuzhiyun xtn $H0,$ACC0 731*4882a593Smuzhiyun ushr $T1.2s,$H3,#26 
732*4882a593Smuzhiyun bic $H3,#0xfc,lsl#24 733*4882a593Smuzhiyun bic $H0,#0xfc,lsl#24 734*4882a593Smuzhiyun add $H1,$H1,$T0.2s // h0 -> h1 735*4882a593Smuzhiyun add $H4,$H4,$T1.2s // h3 -> h4 736*4882a593Smuzhiyun 737*4882a593Smuzhiyun b.hi .Loop_neon 738*4882a593Smuzhiyun 739*4882a593Smuzhiyun.Lskip_loop: 740*4882a593Smuzhiyun dup $IN23_2,${IN23_2}[0] 741*4882a593Smuzhiyun add $IN01_2,$IN01_2,$H2 742*4882a593Smuzhiyun 743*4882a593Smuzhiyun //////////////////////////////////////////////////////////////// 744*4882a593Smuzhiyun // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 745*4882a593Smuzhiyun 746*4882a593Smuzhiyun adds $len,$len,#32 747*4882a593Smuzhiyun b.ne .Long_tail 748*4882a593Smuzhiyun 749*4882a593Smuzhiyun dup $IN23_2,${IN01_2}[0] 750*4882a593Smuzhiyun add $IN23_0,$IN01_0,$H0 751*4882a593Smuzhiyun add $IN23_3,$IN01_3,$H3 752*4882a593Smuzhiyun add $IN23_1,$IN01_1,$H1 753*4882a593Smuzhiyun add $IN23_4,$IN01_4,$H4 754*4882a593Smuzhiyun 755*4882a593Smuzhiyun.Long_tail: 756*4882a593Smuzhiyun dup $IN23_0,${IN23_0}[0] 757*4882a593Smuzhiyun umull2 $ACC0,$IN23_2,${S3} 758*4882a593Smuzhiyun umull2 $ACC3,$IN23_2,${R1} 759*4882a593Smuzhiyun umull2 $ACC4,$IN23_2,${R2} 760*4882a593Smuzhiyun umull2 $ACC2,$IN23_2,${R0} 761*4882a593Smuzhiyun umull2 $ACC1,$IN23_2,${S4} 762*4882a593Smuzhiyun 763*4882a593Smuzhiyun dup $IN23_1,${IN23_1}[0] 764*4882a593Smuzhiyun umlal2 $ACC0,$IN23_0,${R0} 765*4882a593Smuzhiyun umlal2 $ACC2,$IN23_0,${R2} 766*4882a593Smuzhiyun umlal2 $ACC3,$IN23_0,${R3} 767*4882a593Smuzhiyun umlal2 $ACC4,$IN23_0,${R4} 768*4882a593Smuzhiyun umlal2 $ACC1,$IN23_0,${R1} 769*4882a593Smuzhiyun 770*4882a593Smuzhiyun dup $IN23_3,${IN23_3}[0] 771*4882a593Smuzhiyun umlal2 $ACC0,$IN23_1,${S4} 772*4882a593Smuzhiyun umlal2 $ACC3,$IN23_1,${R2} 773*4882a593Smuzhiyun umlal2 $ACC2,$IN23_1,${R1} 774*4882a593Smuzhiyun umlal2 $ACC4,$IN23_1,${R3} 775*4882a593Smuzhiyun umlal2 $ACC1,$IN23_1,${R0} 776*4882a593Smuzhiyun 777*4882a593Smuzhiyun dup $IN23_4,${IN23_4}[0] 
778*4882a593Smuzhiyun umlal2 $ACC3,$IN23_3,${R0} 779*4882a593Smuzhiyun umlal2 $ACC4,$IN23_3,${R1} 780*4882a593Smuzhiyun umlal2 $ACC0,$IN23_3,${S2} 781*4882a593Smuzhiyun umlal2 $ACC1,$IN23_3,${S3} 782*4882a593Smuzhiyun umlal2 $ACC2,$IN23_3,${S4} 783*4882a593Smuzhiyun 784*4882a593Smuzhiyun umlal2 $ACC3,$IN23_4,${S4} 785*4882a593Smuzhiyun umlal2 $ACC0,$IN23_4,${S1} 786*4882a593Smuzhiyun umlal2 $ACC4,$IN23_4,${R0} 787*4882a593Smuzhiyun umlal2 $ACC1,$IN23_4,${S2} 788*4882a593Smuzhiyun umlal2 $ACC2,$IN23_4,${S3} 789*4882a593Smuzhiyun 790*4882a593Smuzhiyun b.eq .Lshort_tail 791*4882a593Smuzhiyun 792*4882a593Smuzhiyun //////////////////////////////////////////////////////////////// 793*4882a593Smuzhiyun // (hash+inp[0:1])*r^4:r^3 and accumulate 794*4882a593Smuzhiyun 795*4882a593Smuzhiyun add $IN01_0,$IN01_0,$H0 796*4882a593Smuzhiyun umlal $ACC3,$IN01_2,${R1} 797*4882a593Smuzhiyun umlal $ACC0,$IN01_2,${S3} 798*4882a593Smuzhiyun umlal $ACC4,$IN01_2,${R2} 799*4882a593Smuzhiyun umlal $ACC1,$IN01_2,${S4} 800*4882a593Smuzhiyun umlal $ACC2,$IN01_2,${R0} 801*4882a593Smuzhiyun 802*4882a593Smuzhiyun add $IN01_1,$IN01_1,$H1 803*4882a593Smuzhiyun umlal $ACC3,$IN01_0,${R3} 804*4882a593Smuzhiyun umlal $ACC0,$IN01_0,${R0} 805*4882a593Smuzhiyun umlal $ACC4,$IN01_0,${R4} 806*4882a593Smuzhiyun umlal $ACC1,$IN01_0,${R1} 807*4882a593Smuzhiyun umlal $ACC2,$IN01_0,${R2} 808*4882a593Smuzhiyun 809*4882a593Smuzhiyun add $IN01_3,$IN01_3,$H3 810*4882a593Smuzhiyun umlal $ACC3,$IN01_1,${R2} 811*4882a593Smuzhiyun umlal $ACC0,$IN01_1,${S4} 812*4882a593Smuzhiyun umlal $ACC4,$IN01_1,${R3} 813*4882a593Smuzhiyun umlal $ACC1,$IN01_1,${R0} 814*4882a593Smuzhiyun umlal $ACC2,$IN01_1,${R1} 815*4882a593Smuzhiyun 816*4882a593Smuzhiyun add $IN01_4,$IN01_4,$H4 817*4882a593Smuzhiyun umlal $ACC3,$IN01_3,${R0} 818*4882a593Smuzhiyun umlal $ACC0,$IN01_3,${S2} 819*4882a593Smuzhiyun umlal $ACC4,$IN01_3,${R1} 820*4882a593Smuzhiyun umlal $ACC1,$IN01_3,${S3} 821*4882a593Smuzhiyun umlal $ACC2,$IN01_3,${S4} 
822*4882a593Smuzhiyun 823*4882a593Smuzhiyun umlal $ACC3,$IN01_4,${S4} 824*4882a593Smuzhiyun umlal $ACC0,$IN01_4,${S1} 825*4882a593Smuzhiyun umlal $ACC4,$IN01_4,${R0} 826*4882a593Smuzhiyun umlal $ACC1,$IN01_4,${S2} 827*4882a593Smuzhiyun umlal $ACC2,$IN01_4,${S3} 828*4882a593Smuzhiyun 829*4882a593Smuzhiyun.Lshort_tail: 830*4882a593Smuzhiyun //////////////////////////////////////////////////////////////// 831*4882a593Smuzhiyun // horizontal add 832*4882a593Smuzhiyun 833*4882a593Smuzhiyun addp $ACC3,$ACC3,$ACC3 834*4882a593Smuzhiyun ldp d8,d9,[sp,#16] // meet ABI requirements 835*4882a593Smuzhiyun addp $ACC0,$ACC0,$ACC0 836*4882a593Smuzhiyun ldp d10,d11,[sp,#32] 837*4882a593Smuzhiyun addp $ACC4,$ACC4,$ACC4 838*4882a593Smuzhiyun ldp d12,d13,[sp,#48] 839*4882a593Smuzhiyun addp $ACC1,$ACC1,$ACC1 840*4882a593Smuzhiyun ldp d14,d15,[sp,#64] 841*4882a593Smuzhiyun addp $ACC2,$ACC2,$ACC2 842*4882a593Smuzhiyun ldr x30,[sp,#8] 843*4882a593Smuzhiyun 844*4882a593Smuzhiyun //////////////////////////////////////////////////////////////// 845*4882a593Smuzhiyun // lazy reduction, but without narrowing 846*4882a593Smuzhiyun 847*4882a593Smuzhiyun ushr $T0.2d,$ACC3,#26 848*4882a593Smuzhiyun and $ACC3,$ACC3,$MASK.2d 849*4882a593Smuzhiyun ushr $T1.2d,$ACC0,#26 850*4882a593Smuzhiyun and $ACC0,$ACC0,$MASK.2d 851*4882a593Smuzhiyun 852*4882a593Smuzhiyun add $ACC4,$ACC4,$T0.2d // h3 -> h4 853*4882a593Smuzhiyun add $ACC1,$ACC1,$T1.2d // h0 -> h1 854*4882a593Smuzhiyun 855*4882a593Smuzhiyun ushr $T0.2d,$ACC4,#26 856*4882a593Smuzhiyun and $ACC4,$ACC4,$MASK.2d 857*4882a593Smuzhiyun ushr $T1.2d,$ACC1,#26 858*4882a593Smuzhiyun and $ACC1,$ACC1,$MASK.2d 859*4882a593Smuzhiyun add $ACC2,$ACC2,$T1.2d // h1 -> h2 860*4882a593Smuzhiyun 861*4882a593Smuzhiyun add $ACC0,$ACC0,$T0.2d 862*4882a593Smuzhiyun shl $T0.2d,$T0.2d,#2 863*4882a593Smuzhiyun ushr $T1.2d,$ACC2,#26 864*4882a593Smuzhiyun and $ACC2,$ACC2,$MASK.2d 865*4882a593Smuzhiyun add $ACC0,$ACC0,$T0.2d // h4 -> h0 866*4882a593Smuzhiyun add 
$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	mov	x4,#1
	st1	{$ACC4}[0],[$ctx]
	str	x4,[$ctx,#8]		// set is_base2_26

	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
.align	2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

# Post-process the generated assembly one line at a time and print it to
# STDOUT (redirected near the top of the script, either to the output file
# or into a pipe feeding arm-xlate.pl).
#
# The first statement is an "or" chain: the first rule whose instruction
# pattern applies ends the chain.  The "(s/.../g or 1)" idiom makes a rule
# count as applied whenever its mnemonic matched, even if the substitution
# itself found nothing to change.
foreach (split("\n",$code)) {
	# shrn: the first vector operand's .2d/.4d arrangement becomes .2s
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	# fmov vN<suffix>,xM  ->  fmov dN,xM
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	# dup: every .2s/.4s arrangement becomes .2d
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	# words starting with eor/and: arrangements become .16b
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	# umull/umlal: .4s operands become .2s
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	# umull2/umlal2: .2s operands become .4s
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	# st1..st4 with a lane index: .2d/.4d becomes a bare .s
	# ("{" is escaped so it can never be taken for a quantifier)
	(m/\bst[1-4]\s+\{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	# Applied to every line: drop the lane count in front of a lane
	# index (first occurrence only), e.g. ".4s[" -> ".s[".
	s/\.[124]([sd])\[/.$1\[/;
	# "w#xN" -> "wN"
	s/w#x([0-9]+)/w$1/g;

	print $_,"\n";
}

# Buffered write errors and, for the piped case, a non-zero exit status of
# the child only surface at close time, so a failure here must be fatal.
# When only the child's exit status is at fault, $! is 0 and $? carries the
# status.
close STDOUT
	or die "error closing STDOUT: ", ($! || "child exited with status $?");