/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2s digest algorithm, ARM scalar implementation
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	// Registers used to hold message words temporarily.  There aren't
	// enough ARM registers to hold the whole message block, so we have to
	// load the words on-demand.
	M_0		.req	r12
	M_1		.req	r14

// The BLAKE2s initialization vector
.Lblake2s_IV:
	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19

// Load the 64-bit value at \src + \offset into the register pair \a (low
// word) and \b (high word).  Uses a single ldrd on ARMv6+; older cores lack
// ldrd, so fall back to two single-word loads.
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

// Store the register pair \a (low word), \b (high word) to \dst + \offset.
// Counterpart of __ldrd: strd on ARMv6+, two single-word stores otherwise.
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

// Convert the 32-bit little-endian word in \a to CPU byte order.  This is a
// no-op when the kernel is little-endian; on big-endian (__ARMEB__) it
// byte-swaps \a via the rev_l assembler.h helper, which may use \tmp as
// scratch on pre-ARMv6 cores.
.macro _le32_bswap	a, tmp
#ifdef __ARMEB__
	rev_l		\a, \tmp
#endif
.endm

// Apply _le32_bswap to each of the eight registers \a..\h, with \tmp free
// for use as scratch.  Used to convert a whole half-block at once.
.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
	_le32_bswap	\a, \tmp
	_le32_bswap	\b, \tmp
	_le32_bswap	\c, \tmp
	_le32_bswap	\d, \tmp
	_le32_bswap	\e, \tmp
	_le32_bswap	\f, \tmp
	_le32_bswap	\g, \tmp
	_le32_bswap	\h, \tmp
.endm

// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals.  s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used.  See the comment above _blake2s_round().
.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3

	// Fetch the first message word needed by each column/diagonal.
	ldr		M_0, [sp, #32 + 4 * \s0]
	ldr		M_1, [sp, #32 + 4 * \s2]

	// a += b + m[blake2s_sigma[r][2*i + 0]];
	// ('b' enters still rotated by 'brot' bits from the previous
	// quarter-round; the implicit 'ror #brot' operand fixes it up here.)
	add		\a0, \a0, \b0, ror #brot
	add		\a1, \a1, \b1, ror #brot
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 16);
	// (the ror #16 itself is deferred; 'drot' fixes up d's prior rotation)
	eor		\d0, \a0, \d0, ror #drot
	eor		\d1, \a1, \d1, ror #drot

	// c += d;  (applies the deferred 16-bit rotation of 'd' on the fly)
	add		\c0, \c0, \d0, ror #16
	add		\c1, \c1, \d1, ror #16

	// b = ror32(b ^ c, 12);  (the ror #12 is deferred to the next use)
	eor		\b0, \c0, \b0, ror #brot
	eor		\b1, \c1, \b1, ror #brot

	// Fetch the second message word needed by each column/diagonal.
	ldr		M_0, [sp, #32 + 4 * \s1]
	ldr		M_1, [sp, #32 + 4 * \s3]

	// a += b + m[blake2s_sigma[r][2*i + 1]];
	add		\a0, \a0, \b0, ror #12
	add		\a1, \a1, \b1, ror #12
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 8);  ('d' still carries its deferred ror #16)
	eor		\d0, \a0, \d0, ror #16
	eor		\d1, \a1, \d1, ror #16

	// c += d;  (applies the deferred 8-bit rotation of 'd')
	add		\c0, \c0, \d0, ror #8
	add		\c1, \c1, \d1, ror #8

	// b = ror32(b ^ c, 7);  ('b' still carries its deferred ror #12.
	// On exit, 'b' is left rotated right by 7 and 'd' by 8 -- i.e.
	// (brot, drot) = (7, 8) -- to be fixed up at the next use.)
	eor		\b0, \c0, \b0, ror #12
	eor		\b1, \c1, \b1, ror #12
.endm
// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[9..15], then to the message block.  r10-r12 and
// r14 are free to use.  The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions.  This is faster than using explicit rotate
// instructions.  To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount.  The rotation amount is then fixed up just in time
// when the values are used.  'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
.macro _blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15

	// Mix first two columns:
	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
				\s0, \s1, \s2, \s3
	__strd		r8, r9, sp, 0		// spill v[8..9]
	__strd		r10, r11, sp, 16	// store v[12..13]

	// Mix second two columns:
	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
				\s4, \s5, \s6, \s7
	str		r10, [sp, #24]		// store v[14]
	// v[10], v[11], and v[15] are used below, so no need to store them yet.

	// After the column quarter-rounds, rows 'b' and 'd' carry deferred
	// rotations of 7 and 8 bits respectively (see macro header comment).
	.set brot, 7
	.set drot, 8

	// Mix first two diagonals:
	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
	ldr		r10, [sp, #16]		// load v[12]
	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
				\s8, \s9, \s10, \s11
	__strd		r8, r9, sp, 8		// store v[10] and v[11]
	str		r11, [sp, #28]		// store v[15]
	str		r10, [sp, #16]		// store v[12]

	// Mix second two diagonals:
	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
				\s12, \s13, \s14, \s15
	__strd		r10, r11, sp, 20	// store v[13] and v[14]
.endm

//
// void blake2s_compress(struct blake2s_state *state,
//			 const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
	.align		5
ENTRY(blake2s_compress)
	push		{r0-r2,r4-r11,lr}	// keep this an even number

.Lnext_block:
	// r0 is 'state'
	// r1 is 'block'
	// r3 is 'inc'

	// Load and increment the counter t[0..1].
	__ldrd		r10, r11, r0, 32
	adds		r10, r10, r3		// t[0] += inc (sets carry)
	adc		r11, r11, #0		// t[1] += carry
	__strd		r10, r11, r0, 32

	// _blake2s_round is very short on registers, so copy the message block
	// to the stack to save a register during the rounds.  This also has the
	// advantage that misalignment only needs to be dealt with in one place.
	sub		sp, sp, #64
	mov		r12, sp
	tst		r1, #3			// block pointer 4-byte aligned?
	bne		.Lcopy_block_misaligned
	ldmia		r1!, {r2-r9}		// copy first 32 bytes
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12!, {r2-r9}
	ldmia		r1!, {r2-r9}		// copy second 32 bytes
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12, {r2-r9}
.Lcopy_block_done:
	str		r1, [sp, #68]		// Update message pointer

	// Calculate v[8..15].  Push v[9..15] onto the stack, and leave space
	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
	mov		r14, r0			// r14 = state
	adr		r12, .Lblake2s_IV
	ldmia		r12!, {r8-r9}		// load IV[0..1]
	__ldrd		r0, r1, r14, 40		// load f[0..1]
	ldm		r12, {r2-r7}		// load IV[2..7]
	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
	push		{r2-r7}			// push v[9..15]
	sub		sp, sp, #8		// leave space for v[8..9]

	// Load h[0..7] == v[0..7].
	ldm		r14, {r0-r7}

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words (the BLAKE2s sigma schedule).
	.set brot, 0
	.set drot, 0
	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// Note that rows 'b' (v[4..7]) and 'd' (v[12..15]) still carry the
	// deferred rotations (brot, drot) = (7, 8) from the last round, which
	// are fixed up via the implicit ror operands below.
	ldr		r14, [sp, #96]		// r14 = &h[0] (saved 'state')
	add		sp, sp, #8		// v[8..9] are already loaded.
	pop		{r10-r11}		// load v[10..11]
	eor		r0, r0, r8		// v[0] ^= v[8]
	eor		r1, r1, r9		// v[1] ^= v[9]
	eor		r2, r2, r10		// v[2] ^= v[10]
	eor		r3, r3, r11		// v[3] ^= v[11]
	ldm		r14, {r8-r11}		// load h[0..3]
	eor		r0, r0, r8		// ^= h[0..3]
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	stmia		r14!, {r0-r3}		// store new h[0..3]
	ldm		r14, {r0-r3}		// load old h[4..7]
	pop		{r8-r11}		// load v[12..15]
	eor		r0, r0, r4, ror #brot	// h[4..7] ^= v[4..7]
	eor		r1, r1, r5, ror #brot
	eor		r2, r2, r6, ror #brot
	eor		r3, r3, r7, ror #brot
	eor		r0, r0, r8, ror #drot	// h[4..7] ^= v[12..15]
	eor		r1, r1, r9, ror #drot
	eor		r2, r2, r10, ror #drot
	eor		r3, r3, r11, ror #drot
	add		sp, sp, #64		// skip copy of message block
	stm		r14, {r0-r3}		// store new h[4..7]

	// Advance to the next block, if there is one.  Note that if there are
	// multiple blocks, then 'inc' (the counter increment amount) must be
	// 64.  So we can simply set it to 64 without re-loading it.
	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
	mov		r3, #64			// set 'inc'
	subs		r2, r2, #1		// nblocks--
	str		r2, [sp, #8]
	bne		.Lnext_block		// nblocks != 0?

	pop		{r0-r2,r4-r11,pc}

	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
	// by r12) using an alternative method.  r2-r9 are free to use.
.Lcopy_block_misaligned:
	mov		r2, #64			// r2 = bytes remaining
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	ldr		r3, [r1], #4		// unaligned word load is OK here
	_le32_bswap	r3, r4
#else
	// Assemble each little-endian word a byte at a time.
	ldrb		r3, [r1, #0]
	ldrb		r4, [r1, #1]
	ldrb		r5, [r1, #2]
	ldrb		r6, [r1, #3]
	add		r1, r1, #4
	orr		r3, r3, r4, lsl #8
	orr		r3, r3, r5, lsl #16
	orr		r3, r3, r6, lsl #24
#endif
	subs		r2, r2, #4
	str		r3, [r12], #4
	bne		1b
	b		.Lcopy_block_done
ENDPROC(blake2s_compress)