/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	/* Arguments, named after the parameters of the nh_neon() prototype */
	KEY		.req	r0
	MESSAGE		.req	r1
	MESSAGE_LEN	.req	r2
	HASH		.req	r3

	/*
	 * Running totals: one q register per pass, holding two 64-bit
	 * partial sums (the _A and _B halves) that are only combined at
	 * the very end.
	 */
	PASS0_SUMS	.req	q0
	PASS0_SUM_A	.req	d0
	PASS0_SUM_B	.req	d1
	PASS1_SUMS	.req	q1
	PASS1_SUM_A	.req	d2
	PASS1_SUM_B	.req	d3
	PASS2_SUMS	.req	q2
	PASS2_SUM_A	.req	d4
	PASS2_SUM_B	.req	d5
	PASS3_SUMS	.req	q3
	PASS3_SUM_A	.req	d6
	PASS3_SUM_B	.req	d7

	/* Four 16-byte key strides kept in registers and used in rotation */
	K0		.req	q4
	K1		.req	q5
	K2		.req	q6
	K3		.req	q7

	/* Scratch registers, with names for their low and high halves */
	T0		.req	q8
	T0_L		.req	d16
	T0_H		.req	d17
	T1		.req	q9
	T1_L		.req	d18
	T1_H		.req	d19
	T2		.req	q10
	T2_L		.req	d20
	T2_H		.req	d21
	T3		.req	q11
	T3_L		.req	d22
	T3_H		.req	d23

/*
 * _nh_stride - fold one 16-byte message stride into all four passes.
 *
 * p0key, p1key and p2key name the registers that already hold the key
 * strides for passes 0, 1 and 2.  p3key names the register that is
 * refilled from [KEY] here and then used for pass 3.  MESSAGE and KEY
 * are each advanced by 16 bytes.  T0-T3 are overwritten.
 */
.macro _nh_stride	p0key, p1key, p2key, p3key

	// Fetch 16 message bytes, then the next 16 bytes of key material
	vld1.8		{T3}, [MESSAGE]!
	vld1.32		{\p3key}, [KEY]!

	// Lane-wise 32-bit sums of the message stride and each key stride.
	// T3 is the message source as well, so it is overwritten last.
	vadd.u32	T0, T3, \p0key
	vadd.u32	T1, T3, \p1key
	vadd.u32	T2, T3, \p2key
	vadd.u32	T3, T3, \p3key

	// Widening multiply of the low pair of sums by the high pair,
	// adding both 64-bit products into the running totals of the pass
	vmlal.u32	PASS0_SUMS, T0_L, T0_H
	vmlal.u32	PASS1_SUMS, T1_L, T1_H
	vmlal.u32	PASS2_SUMS, T2_L, T2_H
	vmlal.u32	PASS3_SUMS, T3_L, T3_H
.endm

/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * Runs four passes over the message at once.  For any given message
 * stride, each pass uses the key stride 16 bytes further on than the
 * previous pass.  The four 64-bit results are stored to hash in pass
 * order (32 bytes in total).
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * NOTE(review): q4-q7 are written here without being saved or restored;
 * presumably every caller owns the NEON register file while this runs.
 * Confirm against the callers.
 */
ENTRY(nh_neon)

	// Preload three key strides and clear the running totals
	vld1.32		{K0,K1}, [KEY]!
	vmov.u64	PASS0_SUMS, #0
	vmov.u64	PASS1_SUMS, #0
	vld1.32		{K2}, [KEY]!
	vmov.u64	PASS2_SUMS, #0
	vmov.u64	PASS3_SUMS, #0

	// Main loop: 64 bytes (four strides) per iteration.  The key
	// registers rotate by one position per stride, so after four
	// strides the register roles line up again.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Ltail
.Lfour_strides:
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lfour_strides

.Ltail:
	// MESSAGE_LEN is now (bytes left - 64).  Masking with 63 recovers
	// the bytes left, which is 0, 16, 32 or 48 because the length is
	// a multiple of 16.  Up to three more strides are processed.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Lfinish
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Lfinish
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Lfinish
	_nh_stride	K2, K3, K0, K1

.Lfinish:
	// Combine the two partial sums of every pass, packing the four
	// results into T0 and T1, then write them out to the hash buffer
	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0-T1}, [HASH]
	bx		lr
ENDPROC(nh_neon)