1*4882a593Smuzhiyun#!/usr/bin/env perl 2*4882a593Smuzhiyun# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause 3*4882a593Smuzhiyun# 4*4882a593Smuzhiyun# ==================================================================== 5*4882a593Smuzhiyun# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL 6*4882a593Smuzhiyun# project. 7*4882a593Smuzhiyun# ==================================================================== 8*4882a593Smuzhiyun 9*4882a593Smuzhiyun# Poly1305 hash for MIPS. 10*4882a593Smuzhiyun# 11*4882a593Smuzhiyun# May 2016 12*4882a593Smuzhiyun# 13*4882a593Smuzhiyun# Numbers are cycles per processed byte with poly1305_blocks alone. 14*4882a593Smuzhiyun# 15*4882a593Smuzhiyun# IALU/gcc 16*4882a593Smuzhiyun# R1x000 ~5.5/+130% (big-endian) 17*4882a593Smuzhiyun# Octeon II 2.50/+70% (little-endian) 18*4882a593Smuzhiyun# 19*4882a593Smuzhiyun# March 2019 20*4882a593Smuzhiyun# 21*4882a593Smuzhiyun# Add 32-bit code path. 22*4882a593Smuzhiyun# 23*4882a593Smuzhiyun# October 2019 24*4882a593Smuzhiyun# 25*4882a593Smuzhiyun# Modulo-scheduling reduction allows to omit dependency chain at the 26*4882a593Smuzhiyun# end of inner loop and improve performance. Also optimize MIPS32R2 27*4882a593Smuzhiyun# code path for MIPS 1004K core. Per René von Dorst's suggestions. 28*4882a593Smuzhiyun# 29*4882a593Smuzhiyun# IALU/gcc 30*4882a593Smuzhiyun# R1x000 ~9.8/? (big-endian) 31*4882a593Smuzhiyun# Octeon II 3.65/+140% (little-endian) 32*4882a593Smuzhiyun# MT7621/1004K 4.75/? (little-endian) 33*4882a593Smuzhiyun# 34*4882a593Smuzhiyun###################################################################### 35*4882a593Smuzhiyun# There is a number of MIPS ABI in use, O32 and N32/64 are most 36*4882a593Smuzhiyun# widely used. Then there is a new contender: NUBI. It appears that if 37*4882a593Smuzhiyun# one picks the latter, it's possible to arrange code in ABI neutral 38*4882a593Smuzhiyun# manner. 
Therefore let's stick to NUBI register layout: 39*4882a593Smuzhiyun# 40*4882a593Smuzhiyun($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); 41*4882a593Smuzhiyun($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); 42*4882a593Smuzhiyun($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); 43*4882a593Smuzhiyun($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); 44*4882a593Smuzhiyun# 45*4882a593Smuzhiyun# The return value is placed in $a0. Following coding rules facilitate 46*4882a593Smuzhiyun# interoperability: 47*4882a593Smuzhiyun# 48*4882a593Smuzhiyun# - never ever touch $tp, "thread pointer", former $gp [o32 can be 49*4882a593Smuzhiyun# excluded from the rule, because it's specified volatile]; 50*4882a593Smuzhiyun# - copy return value to $t0, former $v0 [or to $a0 if you're adapting 51*4882a593Smuzhiyun# old code]; 52*4882a593Smuzhiyun# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; 53*4882a593Smuzhiyun# 54*4882a593Smuzhiyun# For reference here is register layout for N32/64 MIPS ABIs: 55*4882a593Smuzhiyun# 56*4882a593Smuzhiyun# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); 57*4882a593Smuzhiyun# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); 58*4882a593Smuzhiyun# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); 59*4882a593Smuzhiyun# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); 60*4882a593Smuzhiyun# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); 61*4882a593Smuzhiyun# 62*4882a593Smuzhiyun# <appro@openssl.org> 63*4882a593Smuzhiyun# 64*4882a593Smuzhiyun###################################################################### 65*4882a593Smuzhiyun 66*4882a593Smuzhiyun$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 67*4882a593Smuzhiyun 68*4882a593Smuzhiyun$v0 = ($flavour =~ /nubi/i) ? 
$a0 : $t0; 69*4882a593Smuzhiyun 70*4882a593Smuzhiyunif ($flavour =~ /64|n32/i) {{{ 71*4882a593Smuzhiyun###################################################################### 72*4882a593Smuzhiyun# 64-bit code path 73*4882a593Smuzhiyun# 74*4882a593Smuzhiyun 75*4882a593Smuzhiyunmy ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); 76*4882a593Smuzhiyunmy ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); 77*4882a593Smuzhiyun 78*4882a593Smuzhiyun$code.=<<___; 79*4882a593Smuzhiyun#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ 80*4882a593Smuzhiyun defined(_MIPS_ARCH_MIPS64R6)) \\ 81*4882a593Smuzhiyun && !defined(_MIPS_ARCH_MIPS64R2) 82*4882a593Smuzhiyun# define _MIPS_ARCH_MIPS64R2 83*4882a593Smuzhiyun#endif 84*4882a593Smuzhiyun 85*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6) 86*4882a593Smuzhiyun# define dmultu(rs,rt) 87*4882a593Smuzhiyun# define mflo(rd,rs,rt) dmulu rd,rs,rt 88*4882a593Smuzhiyun# define mfhi(rd,rs,rt) dmuhu rd,rs,rt 89*4882a593Smuzhiyun#else 90*4882a593Smuzhiyun# define dmultu(rs,rt) dmultu rs,rt 91*4882a593Smuzhiyun# define mflo(rd,rs,rt) mflo rd 92*4882a593Smuzhiyun# define mfhi(rd,rs,rt) mfhi rd 93*4882a593Smuzhiyun#endif 94*4882a593Smuzhiyun 95*4882a593Smuzhiyun#ifdef __KERNEL__ 96*4882a593Smuzhiyun# define poly1305_init poly1305_init_mips 97*4882a593Smuzhiyun# define poly1305_blocks poly1305_blocks_mips 98*4882a593Smuzhiyun# define poly1305_emit poly1305_emit_mips 99*4882a593Smuzhiyun#endif 100*4882a593Smuzhiyun 101*4882a593Smuzhiyun#if defined(__MIPSEB__) && !defined(MIPSEB) 102*4882a593Smuzhiyun# define MIPSEB 103*4882a593Smuzhiyun#endif 104*4882a593Smuzhiyun 105*4882a593Smuzhiyun#ifdef MIPSEB 106*4882a593Smuzhiyun# define MSB 0 107*4882a593Smuzhiyun# define LSB 7 108*4882a593Smuzhiyun#else 109*4882a593Smuzhiyun# define MSB 7 110*4882a593Smuzhiyun# define LSB 0 111*4882a593Smuzhiyun#endif 112*4882a593Smuzhiyun 113*4882a593Smuzhiyun.text 114*4882a593Smuzhiyun.set noat 115*4882a593Smuzhiyun.set 
noreorder 116*4882a593Smuzhiyun 117*4882a593Smuzhiyun.align 5 118*4882a593Smuzhiyun.globl poly1305_init 119*4882a593Smuzhiyun.ent poly1305_init 120*4882a593Smuzhiyunpoly1305_init: 121*4882a593Smuzhiyun .frame $sp,0,$ra 122*4882a593Smuzhiyun .set reorder 123*4882a593Smuzhiyun 124*4882a593Smuzhiyun sd $zero,0($ctx) 125*4882a593Smuzhiyun sd $zero,8($ctx) 126*4882a593Smuzhiyun sd $zero,16($ctx) 127*4882a593Smuzhiyun 128*4882a593Smuzhiyun beqz $inp,.Lno_key 129*4882a593Smuzhiyun 130*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6) 131*4882a593Smuzhiyun andi $tmp0,$inp,7 # $inp % 8 132*4882a593Smuzhiyun dsubu $inp,$inp,$tmp0 # align $inp 133*4882a593Smuzhiyun sll $tmp0,$tmp0,3 # byte to bit offset 134*4882a593Smuzhiyun ld $in0,0($inp) 135*4882a593Smuzhiyun ld $in1,8($inp) 136*4882a593Smuzhiyun beqz $tmp0,.Laligned_key 137*4882a593Smuzhiyun ld $tmp2,16($inp) 138*4882a593Smuzhiyun 139*4882a593Smuzhiyun subu $tmp1,$zero,$tmp0 140*4882a593Smuzhiyun# ifdef MIPSEB 141*4882a593Smuzhiyun dsllv $in0,$in0,$tmp0 142*4882a593Smuzhiyun dsrlv $tmp3,$in1,$tmp1 143*4882a593Smuzhiyun dsllv $in1,$in1,$tmp0 144*4882a593Smuzhiyun dsrlv $tmp2,$tmp2,$tmp1 145*4882a593Smuzhiyun# else 146*4882a593Smuzhiyun dsrlv $in0,$in0,$tmp0 147*4882a593Smuzhiyun dsllv $tmp3,$in1,$tmp1 148*4882a593Smuzhiyun dsrlv $in1,$in1,$tmp0 149*4882a593Smuzhiyun dsllv $tmp2,$tmp2,$tmp1 150*4882a593Smuzhiyun# endif 151*4882a593Smuzhiyun or $in0,$in0,$tmp3 152*4882a593Smuzhiyun or $in1,$in1,$tmp2 153*4882a593Smuzhiyun.Laligned_key: 154*4882a593Smuzhiyun#else 155*4882a593Smuzhiyun ldl $in0,0+MSB($inp) 156*4882a593Smuzhiyun ldl $in1,8+MSB($inp) 157*4882a593Smuzhiyun ldr $in0,0+LSB($inp) 158*4882a593Smuzhiyun ldr $in1,8+LSB($inp) 159*4882a593Smuzhiyun#endif 160*4882a593Smuzhiyun#ifdef MIPSEB 161*4882a593Smuzhiyun# if defined(_MIPS_ARCH_MIPS64R2) 162*4882a593Smuzhiyun dsbh $in0,$in0 # byte swap 163*4882a593Smuzhiyun dsbh $in1,$in1 164*4882a593Smuzhiyun dshd $in0,$in0 165*4882a593Smuzhiyun dshd $in1,$in1 
166*4882a593Smuzhiyun# else 167*4882a593Smuzhiyun ori $tmp0,$zero,0xFF 168*4882a593Smuzhiyun dsll $tmp2,$tmp0,32 169*4882a593Smuzhiyun or $tmp0,$tmp2 # 0x000000FF000000FF 170*4882a593Smuzhiyun 171*4882a593Smuzhiyun and $tmp1,$in0,$tmp0 # byte swap 172*4882a593Smuzhiyun and $tmp3,$in1,$tmp0 173*4882a593Smuzhiyun dsrl $tmp2,$in0,24 174*4882a593Smuzhiyun dsrl $tmp4,$in1,24 175*4882a593Smuzhiyun dsll $tmp1,24 176*4882a593Smuzhiyun dsll $tmp3,24 177*4882a593Smuzhiyun and $tmp2,$tmp0 178*4882a593Smuzhiyun and $tmp4,$tmp0 179*4882a593Smuzhiyun dsll $tmp0,8 # 0x0000FF000000FF00 180*4882a593Smuzhiyun or $tmp1,$tmp2 181*4882a593Smuzhiyun or $tmp3,$tmp4 182*4882a593Smuzhiyun and $tmp2,$in0,$tmp0 183*4882a593Smuzhiyun and $tmp4,$in1,$tmp0 184*4882a593Smuzhiyun dsrl $in0,8 185*4882a593Smuzhiyun dsrl $in1,8 186*4882a593Smuzhiyun dsll $tmp2,8 187*4882a593Smuzhiyun dsll $tmp4,8 188*4882a593Smuzhiyun and $in0,$tmp0 189*4882a593Smuzhiyun and $in1,$tmp0 190*4882a593Smuzhiyun or $tmp1,$tmp2 191*4882a593Smuzhiyun or $tmp3,$tmp4 192*4882a593Smuzhiyun or $in0,$tmp1 193*4882a593Smuzhiyun or $in1,$tmp3 194*4882a593Smuzhiyun dsrl $tmp1,$in0,32 195*4882a593Smuzhiyun dsrl $tmp3,$in1,32 196*4882a593Smuzhiyun dsll $in0,32 197*4882a593Smuzhiyun dsll $in1,32 198*4882a593Smuzhiyun or $in0,$tmp1 199*4882a593Smuzhiyun or $in1,$tmp3 200*4882a593Smuzhiyun# endif 201*4882a593Smuzhiyun#endif 202*4882a593Smuzhiyun li $tmp0,1 203*4882a593Smuzhiyun dsll $tmp0,32 # 0x0000000100000000 204*4882a593Smuzhiyun daddiu $tmp0,-63 # 0x00000000ffffffc1 205*4882a593Smuzhiyun dsll $tmp0,28 # 0x0ffffffc10000000 206*4882a593Smuzhiyun daddiu $tmp0,-1 # 0x0ffffffc0fffffff 207*4882a593Smuzhiyun 208*4882a593Smuzhiyun and $in0,$tmp0 209*4882a593Smuzhiyun daddiu $tmp0,-3 # 0x0ffffffc0ffffffc 210*4882a593Smuzhiyun and $in1,$tmp0 211*4882a593Smuzhiyun 212*4882a593Smuzhiyun sd $in0,24($ctx) 213*4882a593Smuzhiyun dsrl $tmp0,$in1,2 214*4882a593Smuzhiyun sd $in1,32($ctx) 215*4882a593Smuzhiyun daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) 
216*4882a593Smuzhiyun sd $tmp0,40($ctx) 217*4882a593Smuzhiyun 218*4882a593Smuzhiyun.Lno_key: 219*4882a593Smuzhiyun li $v0,0 # return 0 220*4882a593Smuzhiyun jr $ra 221*4882a593Smuzhiyun.end poly1305_init 222*4882a593Smuzhiyun___ 223*4882a593Smuzhiyun{ 224*4882a593Smuzhiyunmy $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; 225*4882a593Smuzhiyun 226*4882a593Smuzhiyunmy ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = 227*4882a593Smuzhiyun ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); 228*4882a593Smuzhiyunmy ($shr,$shl) = ($s6,$s7); # used on R6 229*4882a593Smuzhiyun 230*4882a593Smuzhiyun$code.=<<___; 231*4882a593Smuzhiyun.align 5 232*4882a593Smuzhiyun.globl poly1305_blocks 233*4882a593Smuzhiyun.ent poly1305_blocks 234*4882a593Smuzhiyunpoly1305_blocks: 235*4882a593Smuzhiyun .set noreorder 236*4882a593Smuzhiyun dsrl $len,4 # number of complete blocks 237*4882a593Smuzhiyun bnez $len,poly1305_blocks_internal 238*4882a593Smuzhiyun nop 239*4882a593Smuzhiyun jr $ra 240*4882a593Smuzhiyun nop 241*4882a593Smuzhiyun.end poly1305_blocks 242*4882a593Smuzhiyun 243*4882a593Smuzhiyun.align 5 244*4882a593Smuzhiyun.ent poly1305_blocks_internal 245*4882a593Smuzhiyunpoly1305_blocks_internal: 246*4882a593Smuzhiyun .set noreorder 247*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6) 248*4882a593Smuzhiyun .frame $sp,8*8,$ra 249*4882a593Smuzhiyun .mask $SAVED_REGS_MASK|0x000c0000,-8 250*4882a593Smuzhiyun dsubu $sp,8*8 251*4882a593Smuzhiyun sd $s7,56($sp) 252*4882a593Smuzhiyun sd $s6,48($sp) 253*4882a593Smuzhiyun#else 254*4882a593Smuzhiyun .frame $sp,6*8,$ra 255*4882a593Smuzhiyun .mask $SAVED_REGS_MASK,-8 256*4882a593Smuzhiyun dsubu $sp,6*8 257*4882a593Smuzhiyun#endif 258*4882a593Smuzhiyun sd $s5,40($sp) 259*4882a593Smuzhiyun sd $s4,32($sp) 260*4882a593Smuzhiyun___ 261*4882a593Smuzhiyun$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue 262*4882a593Smuzhiyun sd $s3,24($sp) 263*4882a593Smuzhiyun sd $s2,16($sp) 264*4882a593Smuzhiyun sd $s1,8($sp) 
265*4882a593Smuzhiyun sd $s0,0($sp) 266*4882a593Smuzhiyun___ 267*4882a593Smuzhiyun$code.=<<___; 268*4882a593Smuzhiyun .set reorder 269*4882a593Smuzhiyun 270*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6) 271*4882a593Smuzhiyun andi $shr,$inp,7 272*4882a593Smuzhiyun dsubu $inp,$inp,$shr # align $inp 273*4882a593Smuzhiyun sll $shr,$shr,3 # byte to bit offset 274*4882a593Smuzhiyun subu $shl,$zero,$shr 275*4882a593Smuzhiyun#endif 276*4882a593Smuzhiyun 277*4882a593Smuzhiyun ld $h0,0($ctx) # load hash value 278*4882a593Smuzhiyun ld $h1,8($ctx) 279*4882a593Smuzhiyun ld $h2,16($ctx) 280*4882a593Smuzhiyun 281*4882a593Smuzhiyun ld $r0,24($ctx) # load key 282*4882a593Smuzhiyun ld $r1,32($ctx) 283*4882a593Smuzhiyun ld $rs1,40($ctx) 284*4882a593Smuzhiyun 285*4882a593Smuzhiyun dsll $len,4 286*4882a593Smuzhiyun daddu $len,$inp # end of buffer 287*4882a593Smuzhiyun b .Loop 288*4882a593Smuzhiyun 289*4882a593Smuzhiyun.align 4 290*4882a593Smuzhiyun.Loop: 291*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6) 292*4882a593Smuzhiyun ld $in0,0($inp) # load input 293*4882a593Smuzhiyun ld $in1,8($inp) 294*4882a593Smuzhiyun beqz $shr,.Laligned_inp 295*4882a593Smuzhiyun 296*4882a593Smuzhiyun ld $tmp2,16($inp) 297*4882a593Smuzhiyun# ifdef MIPSEB 298*4882a593Smuzhiyun dsllv $in0,$in0,$shr 299*4882a593Smuzhiyun dsrlv $tmp3,$in1,$shl 300*4882a593Smuzhiyun dsllv $in1,$in1,$shr 301*4882a593Smuzhiyun dsrlv $tmp2,$tmp2,$shl 302*4882a593Smuzhiyun# else 303*4882a593Smuzhiyun dsrlv $in0,$in0,$shr 304*4882a593Smuzhiyun dsllv $tmp3,$in1,$shl 305*4882a593Smuzhiyun dsrlv $in1,$in1,$shr 306*4882a593Smuzhiyun dsllv $tmp2,$tmp2,$shl 307*4882a593Smuzhiyun# endif 308*4882a593Smuzhiyun or $in0,$in0,$tmp3 309*4882a593Smuzhiyun or $in1,$in1,$tmp2 310*4882a593Smuzhiyun.Laligned_inp: 311*4882a593Smuzhiyun#else 312*4882a593Smuzhiyun ldl $in0,0+MSB($inp) # load input 313*4882a593Smuzhiyun ldl $in1,8+MSB($inp) 314*4882a593Smuzhiyun ldr $in0,0+LSB($inp) 315*4882a593Smuzhiyun ldr $in1,8+LSB($inp) 
316*4882a593Smuzhiyun#endif 317*4882a593Smuzhiyun daddiu $inp,16 318*4882a593Smuzhiyun#ifdef MIPSEB 319*4882a593Smuzhiyun# if defined(_MIPS_ARCH_MIPS64R2) 320*4882a593Smuzhiyun dsbh $in0,$in0 # byte swap 321*4882a593Smuzhiyun dsbh $in1,$in1 322*4882a593Smuzhiyun dshd $in0,$in0 323*4882a593Smuzhiyun dshd $in1,$in1 324*4882a593Smuzhiyun# else 325*4882a593Smuzhiyun ori $tmp0,$zero,0xFF 326*4882a593Smuzhiyun dsll $tmp2,$tmp0,32 327*4882a593Smuzhiyun or $tmp0,$tmp2 # 0x000000FF000000FF 328*4882a593Smuzhiyun 329*4882a593Smuzhiyun and $tmp1,$in0,$tmp0 # byte swap 330*4882a593Smuzhiyun and $tmp3,$in1,$tmp0 331*4882a593Smuzhiyun dsrl $tmp2,$in0,24 332*4882a593Smuzhiyun dsrl $tmp4,$in1,24 333*4882a593Smuzhiyun dsll $tmp1,24 334*4882a593Smuzhiyun dsll $tmp3,24 335*4882a593Smuzhiyun and $tmp2,$tmp0 336*4882a593Smuzhiyun and $tmp4,$tmp0 337*4882a593Smuzhiyun dsll $tmp0,8 # 0x0000FF000000FF00 338*4882a593Smuzhiyun or $tmp1,$tmp2 339*4882a593Smuzhiyun or $tmp3,$tmp4 340*4882a593Smuzhiyun and $tmp2,$in0,$tmp0 341*4882a593Smuzhiyun and $tmp4,$in1,$tmp0 342*4882a593Smuzhiyun dsrl $in0,8 343*4882a593Smuzhiyun dsrl $in1,8 344*4882a593Smuzhiyun dsll $tmp2,8 345*4882a593Smuzhiyun dsll $tmp4,8 346*4882a593Smuzhiyun and $in0,$tmp0 347*4882a593Smuzhiyun and $in1,$tmp0 348*4882a593Smuzhiyun or $tmp1,$tmp2 349*4882a593Smuzhiyun or $tmp3,$tmp4 350*4882a593Smuzhiyun or $in0,$tmp1 351*4882a593Smuzhiyun or $in1,$tmp3 352*4882a593Smuzhiyun dsrl $tmp1,$in0,32 353*4882a593Smuzhiyun dsrl $tmp3,$in1,32 354*4882a593Smuzhiyun dsll $in0,32 355*4882a593Smuzhiyun dsll $in1,32 356*4882a593Smuzhiyun or $in0,$tmp1 357*4882a593Smuzhiyun or $in1,$tmp3 358*4882a593Smuzhiyun# endif 359*4882a593Smuzhiyun#endif 360*4882a593Smuzhiyun dsrl $tmp1,$h2,2 # modulo-scheduled reduction 361*4882a593Smuzhiyun andi $h2,$h2,3 362*4882a593Smuzhiyun dsll $tmp0,$tmp1,2 363*4882a593Smuzhiyun 364*4882a593Smuzhiyun daddu $d0,$h0,$in0 # accumulate input 365*4882a593Smuzhiyun daddu $tmp1,$tmp0 366*4882a593Smuzhiyun sltu $tmp0,$d0,$h0 
367*4882a593Smuzhiyun daddu $d0,$d0,$tmp1 # ... and residue 368*4882a593Smuzhiyun sltu $tmp1,$d0,$tmp1 369*4882a593Smuzhiyun daddu $d1,$h1,$in1 370*4882a593Smuzhiyun daddu $tmp0,$tmp1 371*4882a593Smuzhiyun sltu $tmp1,$d1,$h1 372*4882a593Smuzhiyun daddu $d1,$tmp0 373*4882a593Smuzhiyun 374*4882a593Smuzhiyun dmultu ($r0,$d0) # h0*r0 375*4882a593Smuzhiyun daddu $d2,$h2,$padbit 376*4882a593Smuzhiyun sltu $tmp0,$d1,$tmp0 377*4882a593Smuzhiyun mflo ($h0,$r0,$d0) 378*4882a593Smuzhiyun mfhi ($h1,$r0,$d0) 379*4882a593Smuzhiyun 380*4882a593Smuzhiyun dmultu ($rs1,$d1) # h1*5*r1 381*4882a593Smuzhiyun daddu $d2,$tmp1 382*4882a593Smuzhiyun daddu $d2,$tmp0 383*4882a593Smuzhiyun mflo ($tmp0,$rs1,$d1) 384*4882a593Smuzhiyun mfhi ($tmp1,$rs1,$d1) 385*4882a593Smuzhiyun 386*4882a593Smuzhiyun dmultu ($r1,$d0) # h0*r1 387*4882a593Smuzhiyun mflo ($tmp2,$r1,$d0) 388*4882a593Smuzhiyun mfhi ($h2,$r1,$d0) 389*4882a593Smuzhiyun daddu $h0,$tmp0 390*4882a593Smuzhiyun daddu $h1,$tmp1 391*4882a593Smuzhiyun sltu $tmp0,$h0,$tmp0 392*4882a593Smuzhiyun 393*4882a593Smuzhiyun dmultu ($r0,$d1) # h1*r0 394*4882a593Smuzhiyun daddu $h1,$tmp0 395*4882a593Smuzhiyun daddu $h1,$tmp2 396*4882a593Smuzhiyun mflo ($tmp0,$r0,$d1) 397*4882a593Smuzhiyun mfhi ($tmp1,$r0,$d1) 398*4882a593Smuzhiyun 399*4882a593Smuzhiyun dmultu ($rs1,$d2) # h2*5*r1 400*4882a593Smuzhiyun sltu $tmp2,$h1,$tmp2 401*4882a593Smuzhiyun daddu $h2,$tmp2 402*4882a593Smuzhiyun mflo ($tmp2,$rs1,$d2) 403*4882a593Smuzhiyun 404*4882a593Smuzhiyun dmultu ($r0,$d2) # h2*r0 405*4882a593Smuzhiyun daddu $h1,$tmp0 406*4882a593Smuzhiyun daddu $h2,$tmp1 407*4882a593Smuzhiyun mflo ($tmp3,$r0,$d2) 408*4882a593Smuzhiyun sltu $tmp0,$h1,$tmp0 409*4882a593Smuzhiyun daddu $h2,$tmp0 410*4882a593Smuzhiyun 411*4882a593Smuzhiyun daddu $h1,$tmp2 412*4882a593Smuzhiyun sltu $tmp2,$h1,$tmp2 413*4882a593Smuzhiyun daddu $h2,$tmp2 414*4882a593Smuzhiyun daddu $h2,$tmp3 415*4882a593Smuzhiyun 416*4882a593Smuzhiyun bne $inp,$len,.Loop 417*4882a593Smuzhiyun 418*4882a593Smuzhiyun sd 
$h0,0($ctx) # store hash value 419*4882a593Smuzhiyun sd $h1,8($ctx) 420*4882a593Smuzhiyun sd $h2,16($ctx) 421*4882a593Smuzhiyun 422*4882a593Smuzhiyun .set noreorder 423*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6) 424*4882a593Smuzhiyun ld $s7,56($sp) 425*4882a593Smuzhiyun ld $s6,48($sp) 426*4882a593Smuzhiyun#endif 427*4882a593Smuzhiyun ld $s5,40($sp) # epilogue 428*4882a593Smuzhiyun ld $s4,32($sp) 429*4882a593Smuzhiyun___ 430*4882a593Smuzhiyun$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue 431*4882a593Smuzhiyun ld $s3,24($sp) 432*4882a593Smuzhiyun ld $s2,16($sp) 433*4882a593Smuzhiyun ld $s1,8($sp) 434*4882a593Smuzhiyun ld $s0,0($sp) 435*4882a593Smuzhiyun___ 436*4882a593Smuzhiyun$code.=<<___; 437*4882a593Smuzhiyun jr $ra 438*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6) 439*4882a593Smuzhiyun daddu $sp,8*8 440*4882a593Smuzhiyun#else 441*4882a593Smuzhiyun daddu $sp,6*8 442*4882a593Smuzhiyun#endif 443*4882a593Smuzhiyun.end poly1305_blocks_internal 444*4882a593Smuzhiyun___ 445*4882a593Smuzhiyun} 446*4882a593Smuzhiyun{ 447*4882a593Smuzhiyunmy ($ctx,$mac,$nonce) = ($a0,$a1,$a2); 448*4882a593Smuzhiyun 449*4882a593Smuzhiyun$code.=<<___; 450*4882a593Smuzhiyun.align 5 451*4882a593Smuzhiyun.globl poly1305_emit 452*4882a593Smuzhiyun.ent poly1305_emit 453*4882a593Smuzhiyunpoly1305_emit: 454*4882a593Smuzhiyun .frame $sp,0,$ra 455*4882a593Smuzhiyun .set reorder 456*4882a593Smuzhiyun 457*4882a593Smuzhiyun ld $tmp2,16($ctx) 458*4882a593Smuzhiyun ld $tmp0,0($ctx) 459*4882a593Smuzhiyun ld $tmp1,8($ctx) 460*4882a593Smuzhiyun 461*4882a593Smuzhiyun li $in0,-4 # final reduction 462*4882a593Smuzhiyun dsrl $in1,$tmp2,2 463*4882a593Smuzhiyun and $in0,$tmp2 464*4882a593Smuzhiyun andi $tmp2,$tmp2,3 465*4882a593Smuzhiyun daddu $in0,$in1 466*4882a593Smuzhiyun 467*4882a593Smuzhiyun daddu $tmp0,$tmp0,$in0 468*4882a593Smuzhiyun sltu $in1,$tmp0,$in0 469*4882a593Smuzhiyun daddiu $in0,$tmp0,5 # compare to modulus 470*4882a593Smuzhiyun daddu $tmp1,$tmp1,$in1 
471*4882a593Smuzhiyun sltiu $tmp3,$in0,5 472*4882a593Smuzhiyun sltu $tmp4,$tmp1,$in1 473*4882a593Smuzhiyun daddu $in1,$tmp1,$tmp3 474*4882a593Smuzhiyun daddu $tmp2,$tmp2,$tmp4 475*4882a593Smuzhiyun sltu $tmp3,$in1,$tmp3 476*4882a593Smuzhiyun daddu $tmp2,$tmp2,$tmp3 477*4882a593Smuzhiyun 478*4882a593Smuzhiyun dsrl $tmp2,2 # see if it carried/borrowed 479*4882a593Smuzhiyun dsubu $tmp2,$zero,$tmp2 480*4882a593Smuzhiyun 481*4882a593Smuzhiyun xor $in0,$tmp0 482*4882a593Smuzhiyun xor $in1,$tmp1 483*4882a593Smuzhiyun and $in0,$tmp2 484*4882a593Smuzhiyun and $in1,$tmp2 485*4882a593Smuzhiyun xor $in0,$tmp0 486*4882a593Smuzhiyun xor $in1,$tmp1 487*4882a593Smuzhiyun 488*4882a593Smuzhiyun lwu $tmp0,0($nonce) # load nonce 489*4882a593Smuzhiyun lwu $tmp1,4($nonce) 490*4882a593Smuzhiyun lwu $tmp2,8($nonce) 491*4882a593Smuzhiyun lwu $tmp3,12($nonce) 492*4882a593Smuzhiyun dsll $tmp1,32 493*4882a593Smuzhiyun dsll $tmp3,32 494*4882a593Smuzhiyun or $tmp0,$tmp1 495*4882a593Smuzhiyun or $tmp2,$tmp3 496*4882a593Smuzhiyun 497*4882a593Smuzhiyun daddu $in0,$tmp0 # accumulate nonce 498*4882a593Smuzhiyun daddu $in1,$tmp2 499*4882a593Smuzhiyun sltu $tmp0,$in0,$tmp0 500*4882a593Smuzhiyun daddu $in1,$tmp0 501*4882a593Smuzhiyun 502*4882a593Smuzhiyun dsrl $tmp0,$in0,8 # write mac value 503*4882a593Smuzhiyun dsrl $tmp1,$in0,16 504*4882a593Smuzhiyun dsrl $tmp2,$in0,24 505*4882a593Smuzhiyun sb $in0,0($mac) 506*4882a593Smuzhiyun dsrl $tmp3,$in0,32 507*4882a593Smuzhiyun sb $tmp0,1($mac) 508*4882a593Smuzhiyun dsrl $tmp0,$in0,40 509*4882a593Smuzhiyun sb $tmp1,2($mac) 510*4882a593Smuzhiyun dsrl $tmp1,$in0,48 511*4882a593Smuzhiyun sb $tmp2,3($mac) 512*4882a593Smuzhiyun dsrl $tmp2,$in0,56 513*4882a593Smuzhiyun sb $tmp3,4($mac) 514*4882a593Smuzhiyun dsrl $tmp3,$in1,8 515*4882a593Smuzhiyun sb $tmp0,5($mac) 516*4882a593Smuzhiyun dsrl $tmp0,$in1,16 517*4882a593Smuzhiyun sb $tmp1,6($mac) 518*4882a593Smuzhiyun dsrl $tmp1,$in1,24 519*4882a593Smuzhiyun sb $tmp2,7($mac) 520*4882a593Smuzhiyun 521*4882a593Smuzhiyun sb 
$in1,8($mac) 522*4882a593Smuzhiyun dsrl $tmp2,$in1,32 523*4882a593Smuzhiyun sb $tmp3,9($mac) 524*4882a593Smuzhiyun dsrl $tmp3,$in1,40 525*4882a593Smuzhiyun sb $tmp0,10($mac) 526*4882a593Smuzhiyun dsrl $tmp0,$in1,48 527*4882a593Smuzhiyun sb $tmp1,11($mac) 528*4882a593Smuzhiyun dsrl $tmp1,$in1,56 529*4882a593Smuzhiyun sb $tmp2,12($mac) 530*4882a593Smuzhiyun sb $tmp3,13($mac) 531*4882a593Smuzhiyun sb $tmp0,14($mac) 532*4882a593Smuzhiyun sb $tmp1,15($mac) 533*4882a593Smuzhiyun 534*4882a593Smuzhiyun jr $ra 535*4882a593Smuzhiyun.end poly1305_emit 536*4882a593Smuzhiyun.rdata 537*4882a593Smuzhiyun.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" 538*4882a593Smuzhiyun.align 2 539*4882a593Smuzhiyun___ 540*4882a593Smuzhiyun} 541*4882a593Smuzhiyun}}} else {{{ 542*4882a593Smuzhiyun###################################################################### 543*4882a593Smuzhiyun# 32-bit code path 544*4882a593Smuzhiyun# 545*4882a593Smuzhiyun 546*4882a593Smuzhiyunmy ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); 547*4882a593Smuzhiyunmy ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = 548*4882a593Smuzhiyun ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); 549*4882a593Smuzhiyun 550*4882a593Smuzhiyun$code.=<<___; 551*4882a593Smuzhiyun#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ 552*4882a593Smuzhiyun defined(_MIPS_ARCH_MIPS32R6)) \\ 553*4882a593Smuzhiyun && !defined(_MIPS_ARCH_MIPS32R2) 554*4882a593Smuzhiyun# define _MIPS_ARCH_MIPS32R2 555*4882a593Smuzhiyun#endif 556*4882a593Smuzhiyun 557*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R6) 558*4882a593Smuzhiyun# define multu(rs,rt) 559*4882a593Smuzhiyun# define mflo(rd,rs,rt) mulu rd,rs,rt 560*4882a593Smuzhiyun# define mfhi(rd,rs,rt) muhu rd,rs,rt 561*4882a593Smuzhiyun#else 562*4882a593Smuzhiyun# define multu(rs,rt) multu rs,rt 563*4882a593Smuzhiyun# define mflo(rd,rs,rt) mflo rd 564*4882a593Smuzhiyun# define mfhi(rd,rs,rt) mfhi rd 565*4882a593Smuzhiyun#endif 566*4882a593Smuzhiyun 567*4882a593Smuzhiyun#ifdef __KERNEL__ 
568*4882a593Smuzhiyun# define poly1305_init poly1305_init_mips 569*4882a593Smuzhiyun# define poly1305_blocks poly1305_blocks_mips 570*4882a593Smuzhiyun# define poly1305_emit poly1305_emit_mips 571*4882a593Smuzhiyun#endif 572*4882a593Smuzhiyun 573*4882a593Smuzhiyun#if defined(__MIPSEB__) && !defined(MIPSEB) 574*4882a593Smuzhiyun# define MIPSEB 575*4882a593Smuzhiyun#endif 576*4882a593Smuzhiyun 577*4882a593Smuzhiyun#ifdef MIPSEB 578*4882a593Smuzhiyun# define MSB 0 579*4882a593Smuzhiyun# define LSB 3 580*4882a593Smuzhiyun#else 581*4882a593Smuzhiyun# define MSB 3 582*4882a593Smuzhiyun# define LSB 0 583*4882a593Smuzhiyun#endif 584*4882a593Smuzhiyun 585*4882a593Smuzhiyun.text 586*4882a593Smuzhiyun.set noat 587*4882a593Smuzhiyun.set noreorder 588*4882a593Smuzhiyun 589*4882a593Smuzhiyun.align 5 590*4882a593Smuzhiyun.globl poly1305_init 591*4882a593Smuzhiyun.ent poly1305_init 592*4882a593Smuzhiyunpoly1305_init: 593*4882a593Smuzhiyun .frame $sp,0,$ra 594*4882a593Smuzhiyun .set reorder 595*4882a593Smuzhiyun 596*4882a593Smuzhiyun sw $zero,0($ctx) 597*4882a593Smuzhiyun sw $zero,4($ctx) 598*4882a593Smuzhiyun sw $zero,8($ctx) 599*4882a593Smuzhiyun sw $zero,12($ctx) 600*4882a593Smuzhiyun sw $zero,16($ctx) 601*4882a593Smuzhiyun 602*4882a593Smuzhiyun beqz $inp,.Lno_key 603*4882a593Smuzhiyun 604*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R6) 605*4882a593Smuzhiyun andi $tmp0,$inp,3 # $inp % 4 606*4882a593Smuzhiyun subu $inp,$inp,$tmp0 # align $inp 607*4882a593Smuzhiyun sll $tmp0,$tmp0,3 # byte to bit offset 608*4882a593Smuzhiyun lw $in0,0($inp) 609*4882a593Smuzhiyun lw $in1,4($inp) 610*4882a593Smuzhiyun lw $in2,8($inp) 611*4882a593Smuzhiyun lw $in3,12($inp) 612*4882a593Smuzhiyun beqz $tmp0,.Laligned_key 613*4882a593Smuzhiyun 614*4882a593Smuzhiyun lw $tmp2,16($inp) 615*4882a593Smuzhiyun subu $tmp1,$zero,$tmp0 616*4882a593Smuzhiyun# ifdef MIPSEB 617*4882a593Smuzhiyun sllv $in0,$in0,$tmp0 618*4882a593Smuzhiyun srlv $tmp3,$in1,$tmp1 619*4882a593Smuzhiyun sllv $in1,$in1,$tmp0 
620*4882a593Smuzhiyun or $in0,$in0,$tmp3 621*4882a593Smuzhiyun srlv $tmp3,$in2,$tmp1 622*4882a593Smuzhiyun sllv $in2,$in2,$tmp0 623*4882a593Smuzhiyun or $in1,$in1,$tmp3 624*4882a593Smuzhiyun srlv $tmp3,$in3,$tmp1 625*4882a593Smuzhiyun sllv $in3,$in3,$tmp0 626*4882a593Smuzhiyun or $in2,$in2,$tmp3 627*4882a593Smuzhiyun srlv $tmp2,$tmp2,$tmp1 628*4882a593Smuzhiyun or $in3,$in3,$tmp2 629*4882a593Smuzhiyun# else 630*4882a593Smuzhiyun srlv $in0,$in0,$tmp0 631*4882a593Smuzhiyun sllv $tmp3,$in1,$tmp1 632*4882a593Smuzhiyun srlv $in1,$in1,$tmp0 633*4882a593Smuzhiyun or $in0,$in0,$tmp3 634*4882a593Smuzhiyun sllv $tmp3,$in2,$tmp1 635*4882a593Smuzhiyun srlv $in2,$in2,$tmp0 636*4882a593Smuzhiyun or $in1,$in1,$tmp3 637*4882a593Smuzhiyun sllv $tmp3,$in3,$tmp1 638*4882a593Smuzhiyun srlv $in3,$in3,$tmp0 639*4882a593Smuzhiyun or $in2,$in2,$tmp3 640*4882a593Smuzhiyun sllv $tmp2,$tmp2,$tmp1 641*4882a593Smuzhiyun or $in3,$in3,$tmp2 642*4882a593Smuzhiyun# endif 643*4882a593Smuzhiyun.Laligned_key: 644*4882a593Smuzhiyun#else 645*4882a593Smuzhiyun lwl $in0,0+MSB($inp) 646*4882a593Smuzhiyun lwl $in1,4+MSB($inp) 647*4882a593Smuzhiyun lwl $in2,8+MSB($inp) 648*4882a593Smuzhiyun lwl $in3,12+MSB($inp) 649*4882a593Smuzhiyun lwr $in0,0+LSB($inp) 650*4882a593Smuzhiyun lwr $in1,4+LSB($inp) 651*4882a593Smuzhiyun lwr $in2,8+LSB($inp) 652*4882a593Smuzhiyun lwr $in3,12+LSB($inp) 653*4882a593Smuzhiyun#endif 654*4882a593Smuzhiyun#ifdef MIPSEB 655*4882a593Smuzhiyun# if defined(_MIPS_ARCH_MIPS32R2) 656*4882a593Smuzhiyun wsbh $in0,$in0 # byte swap 657*4882a593Smuzhiyun wsbh $in1,$in1 658*4882a593Smuzhiyun wsbh $in2,$in2 659*4882a593Smuzhiyun wsbh $in3,$in3 660*4882a593Smuzhiyun rotr $in0,$in0,16 661*4882a593Smuzhiyun rotr $in1,$in1,16 662*4882a593Smuzhiyun rotr $in2,$in2,16 663*4882a593Smuzhiyun rotr $in3,$in3,16 664*4882a593Smuzhiyun# else 665*4882a593Smuzhiyun srl $tmp0,$in0,24 # byte swap 666*4882a593Smuzhiyun srl $tmp1,$in0,8 667*4882a593Smuzhiyun andi $tmp2,$in0,0xFF00 668*4882a593Smuzhiyun sll 
$in0,$in0,24 669*4882a593Smuzhiyun andi $tmp1,0xFF00 670*4882a593Smuzhiyun sll $tmp2,$tmp2,8 671*4882a593Smuzhiyun or $in0,$tmp0 672*4882a593Smuzhiyun srl $tmp0,$in1,24 673*4882a593Smuzhiyun or $tmp1,$tmp2 674*4882a593Smuzhiyun srl $tmp2,$in1,8 675*4882a593Smuzhiyun or $in0,$tmp1 676*4882a593Smuzhiyun andi $tmp1,$in1,0xFF00 677*4882a593Smuzhiyun sll $in1,$in1,24 678*4882a593Smuzhiyun andi $tmp2,0xFF00 679*4882a593Smuzhiyun sll $tmp1,$tmp1,8 680*4882a593Smuzhiyun or $in1,$tmp0 681*4882a593Smuzhiyun srl $tmp0,$in2,24 682*4882a593Smuzhiyun or $tmp2,$tmp1 683*4882a593Smuzhiyun srl $tmp1,$in2,8 684*4882a593Smuzhiyun or $in1,$tmp2 685*4882a593Smuzhiyun andi $tmp2,$in2,0xFF00 686*4882a593Smuzhiyun sll $in2,$in2,24 687*4882a593Smuzhiyun andi $tmp1,0xFF00 688*4882a593Smuzhiyun sll $tmp2,$tmp2,8 689*4882a593Smuzhiyun or $in2,$tmp0 690*4882a593Smuzhiyun srl $tmp0,$in3,24 691*4882a593Smuzhiyun or $tmp1,$tmp2 692*4882a593Smuzhiyun srl $tmp2,$in3,8 693*4882a593Smuzhiyun or $in2,$tmp1 694*4882a593Smuzhiyun andi $tmp1,$in3,0xFF00 695*4882a593Smuzhiyun sll $in3,$in3,24 696*4882a593Smuzhiyun andi $tmp2,0xFF00 697*4882a593Smuzhiyun sll $tmp1,$tmp1,8 698*4882a593Smuzhiyun or $in3,$tmp0 699*4882a593Smuzhiyun or $tmp2,$tmp1 700*4882a593Smuzhiyun or $in3,$tmp2 701*4882a593Smuzhiyun# endif 702*4882a593Smuzhiyun#endif 703*4882a593Smuzhiyun lui $tmp0,0x0fff 704*4882a593Smuzhiyun ori $tmp0,0xffff # 0x0fffffff 705*4882a593Smuzhiyun and $in0,$in0,$tmp0 706*4882a593Smuzhiyun subu $tmp0,3 # 0x0ffffffc 707*4882a593Smuzhiyun and $in1,$in1,$tmp0 708*4882a593Smuzhiyun and $in2,$in2,$tmp0 709*4882a593Smuzhiyun and $in3,$in3,$tmp0 710*4882a593Smuzhiyun 711*4882a593Smuzhiyun sw $in0,20($ctx) 712*4882a593Smuzhiyun sw $in1,24($ctx) 713*4882a593Smuzhiyun sw $in2,28($ctx) 714*4882a593Smuzhiyun sw $in3,32($ctx) 715*4882a593Smuzhiyun 716*4882a593Smuzhiyun srl $tmp1,$in1,2 717*4882a593Smuzhiyun srl $tmp2,$in2,2 718*4882a593Smuzhiyun srl $tmp3,$in3,2 719*4882a593Smuzhiyun addu $in1,$in1,$tmp1 # s1 = r1 + (r1 
>> 2) 720*4882a593Smuzhiyun addu $in2,$in2,$tmp2 721*4882a593Smuzhiyun addu $in3,$in3,$tmp3 722*4882a593Smuzhiyun sw $in1,36($ctx) 723*4882a593Smuzhiyun sw $in2,40($ctx) 724*4882a593Smuzhiyun sw $in3,44($ctx) 725*4882a593Smuzhiyun.Lno_key: 726*4882a593Smuzhiyun li $v0,0 727*4882a593Smuzhiyun jr $ra 728*4882a593Smuzhiyun.end poly1305_init 729*4882a593Smuzhiyun___ 730*4882a593Smuzhiyun{ 731*4882a593Smuzhiyunmy $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000"; 732*4882a593Smuzhiyun 733*4882a593Smuzhiyunmy ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = 734*4882a593Smuzhiyun ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); 735*4882a593Smuzhiyunmy ($d0,$d1,$d2,$d3) = 736*4882a593Smuzhiyun ($a4,$a5,$a6,$a7); 737*4882a593Smuzhiyunmy $shr = $t2; # used on R6 738*4882a593Smuzhiyunmy $one = $t2; # used on R2 739*4882a593Smuzhiyun 740*4882a593Smuzhiyun$code.=<<___; 741*4882a593Smuzhiyun.globl poly1305_blocks 742*4882a593Smuzhiyun.align 5 743*4882a593Smuzhiyun.ent poly1305_blocks 744*4882a593Smuzhiyunpoly1305_blocks: 745*4882a593Smuzhiyun .frame $sp,16*4,$ra 746*4882a593Smuzhiyun .mask $SAVED_REGS_MASK,-4 747*4882a593Smuzhiyun .set noreorder 748*4882a593Smuzhiyun subu $sp, $sp,4*12 749*4882a593Smuzhiyun sw $s11,4*11($sp) 750*4882a593Smuzhiyun sw $s10,4*10($sp) 751*4882a593Smuzhiyun sw $s9, 4*9($sp) 752*4882a593Smuzhiyun sw $s8, 4*8($sp) 753*4882a593Smuzhiyun sw $s7, 4*7($sp) 754*4882a593Smuzhiyun sw $s6, 4*6($sp) 755*4882a593Smuzhiyun sw $s5, 4*5($sp) 756*4882a593Smuzhiyun sw $s4, 4*4($sp) 757*4882a593Smuzhiyun___ 758*4882a593Smuzhiyun$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue 759*4882a593Smuzhiyun sw $s3, 4*3($sp) 760*4882a593Smuzhiyun sw $s2, 4*2($sp) 761*4882a593Smuzhiyun sw $s1, 4*1($sp) 762*4882a593Smuzhiyun sw $s0, 4*0($sp) 763*4882a593Smuzhiyun___ 764*4882a593Smuzhiyun$code.=<<___; 765*4882a593Smuzhiyun .set reorder 766*4882a593Smuzhiyun 767*4882a593Smuzhiyun srl $len,4 # number of complete blocks 
768*4882a593Smuzhiyun li $one,1 769*4882a593Smuzhiyun beqz $len,.Labort 770*4882a593Smuzhiyun 771*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R6) 772*4882a593Smuzhiyun andi $shr,$inp,3 773*4882a593Smuzhiyun subu $inp,$inp,$shr # align $inp 774*4882a593Smuzhiyun sll $shr,$shr,3 # byte to bit offset 775*4882a593Smuzhiyun#endif 776*4882a593Smuzhiyun 777*4882a593Smuzhiyun lw $h0,0($ctx) # load hash value 778*4882a593Smuzhiyun lw $h1,4($ctx) 779*4882a593Smuzhiyun lw $h2,8($ctx) 780*4882a593Smuzhiyun lw $h3,12($ctx) 781*4882a593Smuzhiyun lw $h4,16($ctx) 782*4882a593Smuzhiyun 783*4882a593Smuzhiyun lw $r0,20($ctx) # load key 784*4882a593Smuzhiyun lw $r1,24($ctx) 785*4882a593Smuzhiyun lw $r2,28($ctx) 786*4882a593Smuzhiyun lw $r3,32($ctx) 787*4882a593Smuzhiyun lw $rs1,36($ctx) 788*4882a593Smuzhiyun lw $rs2,40($ctx) 789*4882a593Smuzhiyun lw $rs3,44($ctx) 790*4882a593Smuzhiyun 791*4882a593Smuzhiyun sll $len,4 792*4882a593Smuzhiyun addu $len,$len,$inp # end of buffer 793*4882a593Smuzhiyun b .Loop 794*4882a593Smuzhiyun 795*4882a593Smuzhiyun.align 4 796*4882a593Smuzhiyun.Loop: 797*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R6) 798*4882a593Smuzhiyun lw $d0,0($inp) # load input 799*4882a593Smuzhiyun lw $d1,4($inp) 800*4882a593Smuzhiyun lw $d2,8($inp) 801*4882a593Smuzhiyun lw $d3,12($inp) 802*4882a593Smuzhiyun beqz $shr,.Laligned_inp 803*4882a593Smuzhiyun 804*4882a593Smuzhiyun lw $t0,16($inp) 805*4882a593Smuzhiyun subu $t1,$zero,$shr 806*4882a593Smuzhiyun# ifdef MIPSEB 807*4882a593Smuzhiyun sllv $d0,$d0,$shr 808*4882a593Smuzhiyun srlv $at,$d1,$t1 809*4882a593Smuzhiyun sllv $d1,$d1,$shr 810*4882a593Smuzhiyun or $d0,$d0,$at 811*4882a593Smuzhiyun srlv $at,$d2,$t1 812*4882a593Smuzhiyun sllv $d2,$d2,$shr 813*4882a593Smuzhiyun or $d1,$d1,$at 814*4882a593Smuzhiyun srlv $at,$d3,$t1 815*4882a593Smuzhiyun sllv $d3,$d3,$shr 816*4882a593Smuzhiyun or $d2,$d2,$at 817*4882a593Smuzhiyun srlv $t0,$t0,$t1 818*4882a593Smuzhiyun or $d3,$d3,$t0 819*4882a593Smuzhiyun# else 820*4882a593Smuzhiyun srlv 
$d0,$d0,$shr 821*4882a593Smuzhiyun sllv $at,$d1,$t1 822*4882a593Smuzhiyun srlv $d1,$d1,$shr 823*4882a593Smuzhiyun or $d0,$d0,$at 824*4882a593Smuzhiyun sllv $at,$d2,$t1 825*4882a593Smuzhiyun srlv $d2,$d2,$shr 826*4882a593Smuzhiyun or $d1,$d1,$at 827*4882a593Smuzhiyun sllv $at,$d3,$t1 828*4882a593Smuzhiyun srlv $d3,$d3,$shr 829*4882a593Smuzhiyun or $d2,$d2,$at 830*4882a593Smuzhiyun sllv $t0,$t0,$t1 831*4882a593Smuzhiyun or $d3,$d3,$t0 832*4882a593Smuzhiyun# endif 833*4882a593Smuzhiyun.Laligned_inp: 834*4882a593Smuzhiyun#else 835*4882a593Smuzhiyun lwl $d0,0+MSB($inp) # load input 836*4882a593Smuzhiyun lwl $d1,4+MSB($inp) 837*4882a593Smuzhiyun lwl $d2,8+MSB($inp) 838*4882a593Smuzhiyun lwl $d3,12+MSB($inp) 839*4882a593Smuzhiyun lwr $d0,0+LSB($inp) 840*4882a593Smuzhiyun lwr $d1,4+LSB($inp) 841*4882a593Smuzhiyun lwr $d2,8+LSB($inp) 842*4882a593Smuzhiyun lwr $d3,12+LSB($inp) 843*4882a593Smuzhiyun#endif 844*4882a593Smuzhiyun#ifdef MIPSEB 845*4882a593Smuzhiyun# if defined(_MIPS_ARCH_MIPS32R2) 846*4882a593Smuzhiyun wsbh $d0,$d0 # byte swap 847*4882a593Smuzhiyun wsbh $d1,$d1 848*4882a593Smuzhiyun wsbh $d2,$d2 849*4882a593Smuzhiyun wsbh $d3,$d3 850*4882a593Smuzhiyun rotr $d0,$d0,16 851*4882a593Smuzhiyun rotr $d1,$d1,16 852*4882a593Smuzhiyun rotr $d2,$d2,16 853*4882a593Smuzhiyun rotr $d3,$d3,16 854*4882a593Smuzhiyun# else 855*4882a593Smuzhiyun srl $at,$d0,24 # byte swap 856*4882a593Smuzhiyun srl $t0,$d0,8 857*4882a593Smuzhiyun andi $t1,$d0,0xFF00 858*4882a593Smuzhiyun sll $d0,$d0,24 859*4882a593Smuzhiyun andi $t0,0xFF00 860*4882a593Smuzhiyun sll $t1,$t1,8 861*4882a593Smuzhiyun or $d0,$at 862*4882a593Smuzhiyun srl $at,$d1,24 863*4882a593Smuzhiyun or $t0,$t1 864*4882a593Smuzhiyun srl $t1,$d1,8 865*4882a593Smuzhiyun or $d0,$t0 866*4882a593Smuzhiyun andi $t0,$d1,0xFF00 867*4882a593Smuzhiyun sll $d1,$d1,24 868*4882a593Smuzhiyun andi $t1,0xFF00 869*4882a593Smuzhiyun sll $t0,$t0,8 870*4882a593Smuzhiyun or $d1,$at 871*4882a593Smuzhiyun srl $at,$d2,24 872*4882a593Smuzhiyun or $t1,$t0 
873*4882a593Smuzhiyun srl $t0,$d2,8 874*4882a593Smuzhiyun or $d1,$t1 875*4882a593Smuzhiyun andi $t1,$d2,0xFF00 876*4882a593Smuzhiyun sll $d2,$d2,24 877*4882a593Smuzhiyun andi $t0,0xFF00 878*4882a593Smuzhiyun sll $t1,$t1,8 879*4882a593Smuzhiyun or $d2,$at 880*4882a593Smuzhiyun srl $at,$d3,24 881*4882a593Smuzhiyun or $t0,$t1 882*4882a593Smuzhiyun srl $t1,$d3,8 883*4882a593Smuzhiyun or $d2,$t0 884*4882a593Smuzhiyun andi $t0,$d3,0xFF00 885*4882a593Smuzhiyun sll $d3,$d3,24 886*4882a593Smuzhiyun andi $t1,0xFF00 887*4882a593Smuzhiyun sll $t0,$t0,8 888*4882a593Smuzhiyun or $d3,$at 889*4882a593Smuzhiyun or $t1,$t0 890*4882a593Smuzhiyun or $d3,$t1 891*4882a593Smuzhiyun# endif 892*4882a593Smuzhiyun#endif 893*4882a593Smuzhiyun srl $t0,$h4,2 # modulo-scheduled reduction 894*4882a593Smuzhiyun andi $h4,$h4,3 895*4882a593Smuzhiyun sll $at,$t0,2 896*4882a593Smuzhiyun 897*4882a593Smuzhiyun addu $d0,$d0,$h0 # accumulate input 898*4882a593Smuzhiyun addu $t0,$t0,$at 899*4882a593Smuzhiyun sltu $h0,$d0,$h0 900*4882a593Smuzhiyun addu $d0,$d0,$t0 # ... 
and residue 901*4882a593Smuzhiyun sltu $at,$d0,$t0 902*4882a593Smuzhiyun 903*4882a593Smuzhiyun addu $d1,$d1,$h1 904*4882a593Smuzhiyun addu $h0,$h0,$at # carry 905*4882a593Smuzhiyun sltu $h1,$d1,$h1 906*4882a593Smuzhiyun addu $d1,$d1,$h0 907*4882a593Smuzhiyun sltu $h0,$d1,$h0 908*4882a593Smuzhiyun 909*4882a593Smuzhiyun addu $d2,$d2,$h2 910*4882a593Smuzhiyun addu $h1,$h1,$h0 # carry 911*4882a593Smuzhiyun sltu $h2,$d2,$h2 912*4882a593Smuzhiyun addu $d2,$d2,$h1 913*4882a593Smuzhiyun sltu $h1,$d2,$h1 914*4882a593Smuzhiyun 915*4882a593Smuzhiyun addu $d3,$d3,$h3 916*4882a593Smuzhiyun addu $h2,$h2,$h1 # carry 917*4882a593Smuzhiyun sltu $h3,$d3,$h3 918*4882a593Smuzhiyun addu $d3,$d3,$h2 919*4882a593Smuzhiyun 920*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) 921*4882a593Smuzhiyun multu $r0,$d0 # d0*r0 922*4882a593Smuzhiyun sltu $h2,$d3,$h2 923*4882a593Smuzhiyun maddu $rs3,$d1 # d1*s3 924*4882a593Smuzhiyun addu $h3,$h3,$h2 # carry 925*4882a593Smuzhiyun maddu $rs2,$d2 # d2*s2 926*4882a593Smuzhiyun addu $h4,$h4,$padbit 927*4882a593Smuzhiyun maddu $rs1,$d3 # d3*s1 928*4882a593Smuzhiyun addu $h4,$h4,$h3 929*4882a593Smuzhiyun mfhi $at 930*4882a593Smuzhiyun mflo $h0 931*4882a593Smuzhiyun 932*4882a593Smuzhiyun multu $r1,$d0 # d0*r1 933*4882a593Smuzhiyun maddu $r0,$d1 # d1*r0 934*4882a593Smuzhiyun maddu $rs3,$d2 # d2*s3 935*4882a593Smuzhiyun maddu $rs2,$d3 # d3*s2 936*4882a593Smuzhiyun maddu $rs1,$h4 # h4*s1 937*4882a593Smuzhiyun maddu $at,$one # hi*1 938*4882a593Smuzhiyun mfhi $at 939*4882a593Smuzhiyun mflo $h1 940*4882a593Smuzhiyun 941*4882a593Smuzhiyun multu $r2,$d0 # d0*r2 942*4882a593Smuzhiyun maddu $r1,$d1 # d1*r1 943*4882a593Smuzhiyun maddu $r0,$d2 # d2*r0 944*4882a593Smuzhiyun maddu $rs3,$d3 # d3*s3 945*4882a593Smuzhiyun maddu $rs2,$h4 # h4*s2 946*4882a593Smuzhiyun maddu $at,$one # hi*1 947*4882a593Smuzhiyun mfhi $at 948*4882a593Smuzhiyun mflo $h2 949*4882a593Smuzhiyun 950*4882a593Smuzhiyun mul $t0,$r0,$h4 # h4*r0 951*4882a593Smuzhiyun 
952*4882a593Smuzhiyun multu $r3,$d0 # d0*r3 953*4882a593Smuzhiyun maddu $r2,$d1 # d1*r2 954*4882a593Smuzhiyun maddu $r1,$d2 # d2*r1 955*4882a593Smuzhiyun maddu $r0,$d3 # d3*r0 956*4882a593Smuzhiyun maddu $rs3,$h4 # h4*s3 957*4882a593Smuzhiyun maddu $at,$one # hi*1 958*4882a593Smuzhiyun mfhi $at 959*4882a593Smuzhiyun mflo $h3 960*4882a593Smuzhiyun 961*4882a593Smuzhiyun addiu $inp,$inp,16 962*4882a593Smuzhiyun 963*4882a593Smuzhiyun addu $h4,$t0,$at 964*4882a593Smuzhiyun#else 965*4882a593Smuzhiyun multu ($r0,$d0) # d0*r0 966*4882a593Smuzhiyun mflo ($h0,$r0,$d0) 967*4882a593Smuzhiyun mfhi ($h1,$r0,$d0) 968*4882a593Smuzhiyun 969*4882a593Smuzhiyun sltu $h2,$d3,$h2 970*4882a593Smuzhiyun addu $h3,$h3,$h2 # carry 971*4882a593Smuzhiyun 972*4882a593Smuzhiyun multu ($rs3,$d1) # d1*s3 973*4882a593Smuzhiyun mflo ($at,$rs3,$d1) 974*4882a593Smuzhiyun mfhi ($t0,$rs3,$d1) 975*4882a593Smuzhiyun 976*4882a593Smuzhiyun addu $h4,$h4,$padbit 977*4882a593Smuzhiyun addiu $inp,$inp,16 978*4882a593Smuzhiyun addu $h4,$h4,$h3 979*4882a593Smuzhiyun 980*4882a593Smuzhiyun multu ($rs2,$d2) # d2*s2 981*4882a593Smuzhiyun mflo ($a3,$rs2,$d2) 982*4882a593Smuzhiyun mfhi ($t1,$rs2,$d2) 983*4882a593Smuzhiyun addu $h0,$h0,$at 984*4882a593Smuzhiyun addu $h1,$h1,$t0 985*4882a593Smuzhiyun multu ($rs1,$d3) # d3*s1 986*4882a593Smuzhiyun sltu $at,$h0,$at 987*4882a593Smuzhiyun addu $h1,$h1,$at 988*4882a593Smuzhiyun 989*4882a593Smuzhiyun mflo ($at,$rs1,$d3) 990*4882a593Smuzhiyun mfhi ($t0,$rs1,$d3) 991*4882a593Smuzhiyun addu $h0,$h0,$a3 992*4882a593Smuzhiyun addu $h1,$h1,$t1 993*4882a593Smuzhiyun multu ($r1,$d0) # d0*r1 994*4882a593Smuzhiyun sltu $a3,$h0,$a3 995*4882a593Smuzhiyun addu $h1,$h1,$a3 996*4882a593Smuzhiyun 997*4882a593Smuzhiyun 998*4882a593Smuzhiyun mflo ($a3,$r1,$d0) 999*4882a593Smuzhiyun mfhi ($h2,$r1,$d0) 1000*4882a593Smuzhiyun addu $h0,$h0,$at 1001*4882a593Smuzhiyun addu $h1,$h1,$t0 1002*4882a593Smuzhiyun multu ($r0,$d1) # d1*r0 1003*4882a593Smuzhiyun sltu $at,$h0,$at 1004*4882a593Smuzhiyun addu 
$h1,$h1,$at 1005*4882a593Smuzhiyun 1006*4882a593Smuzhiyun mflo ($at,$r0,$d1) 1007*4882a593Smuzhiyun mfhi ($t0,$r0,$d1) 1008*4882a593Smuzhiyun addu $h1,$h1,$a3 1009*4882a593Smuzhiyun sltu $a3,$h1,$a3 1010*4882a593Smuzhiyun multu ($rs3,$d2) # d2*s3 1011*4882a593Smuzhiyun addu $h2,$h2,$a3 1012*4882a593Smuzhiyun 1013*4882a593Smuzhiyun mflo ($a3,$rs3,$d2) 1014*4882a593Smuzhiyun mfhi ($t1,$rs3,$d2) 1015*4882a593Smuzhiyun addu $h1,$h1,$at 1016*4882a593Smuzhiyun addu $h2,$h2,$t0 1017*4882a593Smuzhiyun multu ($rs2,$d3) # d3*s2 1018*4882a593Smuzhiyun sltu $at,$h1,$at 1019*4882a593Smuzhiyun addu $h2,$h2,$at 1020*4882a593Smuzhiyun 1021*4882a593Smuzhiyun mflo ($at,$rs2,$d3) 1022*4882a593Smuzhiyun mfhi ($t0,$rs2,$d3) 1023*4882a593Smuzhiyun addu $h1,$h1,$a3 1024*4882a593Smuzhiyun addu $h2,$h2,$t1 1025*4882a593Smuzhiyun multu ($rs1,$h4) # h4*s1 1026*4882a593Smuzhiyun sltu $a3,$h1,$a3 1027*4882a593Smuzhiyun addu $h2,$h2,$a3 1028*4882a593Smuzhiyun 1029*4882a593Smuzhiyun mflo ($a3,$rs1,$h4) 1030*4882a593Smuzhiyun addu $h1,$h1,$at 1031*4882a593Smuzhiyun addu $h2,$h2,$t0 1032*4882a593Smuzhiyun multu ($r2,$d0) # d0*r2 1033*4882a593Smuzhiyun sltu $at,$h1,$at 1034*4882a593Smuzhiyun addu $h2,$h2,$at 1035*4882a593Smuzhiyun 1036*4882a593Smuzhiyun 1037*4882a593Smuzhiyun mflo ($at,$r2,$d0) 1038*4882a593Smuzhiyun mfhi ($h3,$r2,$d0) 1039*4882a593Smuzhiyun addu $h1,$h1,$a3 1040*4882a593Smuzhiyun sltu $a3,$h1,$a3 1041*4882a593Smuzhiyun multu ($r1,$d1) # d1*r1 1042*4882a593Smuzhiyun addu $h2,$h2,$a3 1043*4882a593Smuzhiyun 1044*4882a593Smuzhiyun mflo ($a3,$r1,$d1) 1045*4882a593Smuzhiyun mfhi ($t1,$r1,$d1) 1046*4882a593Smuzhiyun addu $h2,$h2,$at 1047*4882a593Smuzhiyun sltu $at,$h2,$at 1048*4882a593Smuzhiyun multu ($r0,$d2) # d2*r0 1049*4882a593Smuzhiyun addu $h3,$h3,$at 1050*4882a593Smuzhiyun 1051*4882a593Smuzhiyun mflo ($at,$r0,$d2) 1052*4882a593Smuzhiyun mfhi ($t0,$r0,$d2) 1053*4882a593Smuzhiyun addu $h2,$h2,$a3 1054*4882a593Smuzhiyun addu $h3,$h3,$t1 1055*4882a593Smuzhiyun multu ($rs3,$d3) # d3*s3 
1056*4882a593Smuzhiyun sltu $a3,$h2,$a3 1057*4882a593Smuzhiyun addu $h3,$h3,$a3 1058*4882a593Smuzhiyun 1059*4882a593Smuzhiyun mflo ($a3,$rs3,$d3) 1060*4882a593Smuzhiyun mfhi ($t1,$rs3,$d3) 1061*4882a593Smuzhiyun addu $h2,$h2,$at 1062*4882a593Smuzhiyun addu $h3,$h3,$t0 1063*4882a593Smuzhiyun multu ($rs2,$h4) # h4*s2 1064*4882a593Smuzhiyun sltu $at,$h2,$at 1065*4882a593Smuzhiyun addu $h3,$h3,$at 1066*4882a593Smuzhiyun 1067*4882a593Smuzhiyun mflo ($at,$rs2,$h4) 1068*4882a593Smuzhiyun addu $h2,$h2,$a3 1069*4882a593Smuzhiyun addu $h3,$h3,$t1 1070*4882a593Smuzhiyun multu ($r3,$d0) # d0*r3 1071*4882a593Smuzhiyun sltu $a3,$h2,$a3 1072*4882a593Smuzhiyun addu $h3,$h3,$a3 1073*4882a593Smuzhiyun 1074*4882a593Smuzhiyun 1075*4882a593Smuzhiyun mflo ($a3,$r3,$d0) 1076*4882a593Smuzhiyun mfhi ($t1,$r3,$d0) 1077*4882a593Smuzhiyun addu $h2,$h2,$at 1078*4882a593Smuzhiyun sltu $at,$h2,$at 1079*4882a593Smuzhiyun multu ($r2,$d1) # d1*r2 1080*4882a593Smuzhiyun addu $h3,$h3,$at 1081*4882a593Smuzhiyun 1082*4882a593Smuzhiyun mflo ($at,$r2,$d1) 1083*4882a593Smuzhiyun mfhi ($t0,$r2,$d1) 1084*4882a593Smuzhiyun addu $h3,$h3,$a3 1085*4882a593Smuzhiyun sltu $a3,$h3,$a3 1086*4882a593Smuzhiyun multu ($r0,$d3) # d3*r0 1087*4882a593Smuzhiyun addu $t1,$t1,$a3 1088*4882a593Smuzhiyun 1089*4882a593Smuzhiyun mflo ($a3,$r0,$d3) 1090*4882a593Smuzhiyun mfhi ($d3,$r0,$d3) 1091*4882a593Smuzhiyun addu $h3,$h3,$at 1092*4882a593Smuzhiyun addu $t1,$t1,$t0 1093*4882a593Smuzhiyun multu ($r1,$d2) # d2*r1 1094*4882a593Smuzhiyun sltu $at,$h3,$at 1095*4882a593Smuzhiyun addu $t1,$t1,$at 1096*4882a593Smuzhiyun 1097*4882a593Smuzhiyun mflo ($at,$r1,$d2) 1098*4882a593Smuzhiyun mfhi ($t0,$r1,$d2) 1099*4882a593Smuzhiyun addu $h3,$h3,$a3 1100*4882a593Smuzhiyun addu $t1,$t1,$d3 1101*4882a593Smuzhiyun multu ($rs3,$h4) # h4*s3 1102*4882a593Smuzhiyun sltu $a3,$h3,$a3 1103*4882a593Smuzhiyun addu $t1,$t1,$a3 1104*4882a593Smuzhiyun 1105*4882a593Smuzhiyun mflo ($a3,$rs3,$h4) 1106*4882a593Smuzhiyun addu $h3,$h3,$at 1107*4882a593Smuzhiyun 
addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at


	mflo	($h4,$r0,$h4)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12		# (delay slot) release the 12-word frame
.end	poly1305_blocks
___
}
{
# poly1305_emit(ctx, mac, nonce), 32-bit code path.
#
# Emits the routine that turns the accumulated hash into the final tag:
#   1. load the five 32-bit hash words from offsets 0..16 of ctx;
#   2. fold the bits above bit 1 of the top word back into the low words
#      (multiplied by 5) -- the "final reduction";
#   3. compute h+5 alongside h and use the carry out of the top word to
#      build an all-zeros/all-ones mask that selects between the two;
#   4. add the four nonce words with carry propagation;
#   5. store the 16 result bytes one at a time, least significant byte of
#      each word first, so the output layout does not depend on endianness
#      or on the alignment of mac.
#
# $ctx is reused as a scratch/carry register once the hash words have been
# loaded.  $tmp0..$tmp3 and $in0..$in3 are not declared in this block; they
# are presumably set up earlier in the enclosing scope -- confirm there
# before renaming or reallocating registers.
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0		# 5 * (top word >> 2)

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	sltu	$ctx,$in3,$tmp3
	addu	$ctx,$tmp4

	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx		# 0 or all-ones select mask

	xor	$in0,$tmp0		# branch-free select: take the h+5 words
	xor	$in1,$tmp1		# when the mask is set, otherwise keep h
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}

# Write the generated assembly.  The last command-line argument, if any, is
# the output file; with no argument the code goes to the inherited STDOUT.
#
# Both the open and the close are checked: a failed open or a buffered write
# error that only surfaces at close must stop the build, rather than leave an
# empty or truncated file behind with a zero exit status.
# The three-argument open takes the file name literally (no mode characters
# or whitespace trimming are interpreted from it).
$output = pop;
if ($output) {
	open STDOUT, '>', $output
	    or die "can't open $output for writing: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";