/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32

#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * Must be incremented every loop iteration.
 */
#define NONCE_0		$v0
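
/* For reference: the state STATE points at is sixteen 32-bit little-endian
 * words, arranged as in RFC 7539: words 0-3 hold the ChaCha constants,
 * words 4-11 the key and words 12-15 the counter/nonce material.  Word 12
 * (offset 48) is the block counter this code shadows in NONCE_0 and bumps
 * once per 64-byte block.
 */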
/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

#define IS_UNALIGNED	$s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif

#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);
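
/* A note on STORE_UNALIGNED: the lwl/lwr pair assembles an unaligned input
 * word from its two aligned halves (MSB/LSB pick the right end for the
 * host endianness), the keystream word is formed with the usual ChaCha
 * feed-forward Xn += state[n] (NONCE_0 stands in for state[12]), converted
 * to little endian on big-endian CPUs, xored with the input word, and
 * written back with the matching swl/swr pair.
 */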
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn.
 * Every jump table entry must be equal in size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S;  \
	rotl	X(W), S;  \
	rotl	X(Y), S;  \
	rotl	X(Z), S;
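
/* AXR is one add/xor/rotate step of the ChaCha quarter-round, applied to
 * four word groups at once: X(A) += X(K); X(V) ^= X(A); X(V) <<<= S, and
 * likewise for B/L/W, C/M/Y and D/N/Z.  Eight AXR lines with the rotate
 * amounts 16, 12, 8 and 7 make up one double round (four column
 * quarter-rounds, then four diagonal ones), which is why the round
 * counter below is decremented by 2 per pass.
 */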
.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return if bytes = 0. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test if IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3, 12(STATE)

	lw	X4, 16(STATE)
	lw	X5, 20(STATE)
	lw	X6, 24(STATE)
	lw	X7, 28(STATE)
	lw	X8, 32(STATE)
	lw	X9, 36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump. */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set the number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0 means there is no full block left. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill the delay slot. */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle the last bytes. */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to its location in the state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra
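
/* Partial-block dispatch: fewer than 64 bytes remain.  The code below
 * computes $at = the number of tail bytes covered by whole state words,
 * loads the state word that covers the trailing partial word into
 * SAVED_CA, and indexes the jump table.  Each table entry is two
 * instructions (8 bytes), so "ins T0, $at, 1, 6" scales $at, a multiple
 * of 4, by 2 to form the entry offset.  Entry n branches into the
 * reversed STORE_* chain so that exactly words n-1 .. 0 are xored and
 * stored, and its delay slot precomputes SAVED_X, the keystream word for
 * the byte loop at .Lchacha_mips_xor_bytes.
 */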
.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of bytes in full words */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte counter as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set the number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0 means there is no full block left. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to its location in the state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill the delay slot. */
	addiu	NONCE_0, 1
	.set reorder
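
/* Byte-granular tail: on arrival here $at holds the number of tail bytes
 * already handled as whole words and BYTES the remaining 1..3 bytes as a
 * negative count.  SAVED_X is the keystream word covering those bytes;
 * each step xors one byte and then rotates SAVED_X by 8 so the next
 * keystream byte lands in the low byte (CPU_TO_LE32/ROTR/ROTx absorb the
 * endianness difference).
 */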
.Lchacha_mips_xor_bytes:
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of bytes in full words */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte counter as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE
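
/* hchacha_block_arch implements the HChaCha core used for XChaCha key
 * derivation: it runs the given number of rounds over the 64-byte input
 * state and writes out only words 0-3 and 12-15 of the result, with no
 * feed-forward addition of the input state.
 */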
.set	noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3, 12(STATE)
	lw	X4, 16(STATE)
	lw	X5, 20(STATE)
	lw	X6, 24(STATE)
	lw	X7, 28(STATE)
	lw	X8, 32(STATE)
	lw	X9, 36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3, 12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set	at