/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	/* Arguments, in the order of the nh_neon() prototype documented below. */
	KEY		.req	x0
	MESSAGE		.req	x1
	MESSAGE_LEN	.req	x2
	HASH		.req	x3

	/* One 2 x 64-bit accumulator vector per NH pass. */
	PASS0_SUMS	.req	v0
	PASS1_SUMS	.req	v1
	PASS2_SUMS	.req	v2
	PASS3_SUMS	.req	v3

	/* Sliding window of four consecutive 16-byte key strides. */
	K0		.req	v4
	K1		.req	v5
	K2		.req	v6
	K3		.req	v7

	/*
	 * Scratch registers used inside _nh_stride.
	 *
	 * NOTE(review): T0-T7 alias v8-v15, whose low 64 bits are callee-saved
	 * under AAPCS64, and they are overwritten here without being saved.
	 * Presumably this is safe because callers bracket this routine with
	 * kernel_neon_begin()/kernel_neon_end() -- confirm against the glue code.
	 */
	T0		.req	v8
	T1		.req	v9
	T2		.req	v10
	T3		.req	v11
	T4		.req	v12
	T5		.req	v13
	T6		.req	v14
	T7		.req	v15

/*
 * _nh_stride k0, k1, k2, k3
 *
 * Process one 16-byte message stride for all four passes.
 *
 * k0, k1 and k2 name the key vectors that are already loaded; k3 names the
 * register that receives the next 16 bytes of key material.  Pass N adds the
 * message words to key vector kN, then multiplies 32-bit lanes 0 and 1 of
 * that sum by lanes 2 and 3 respectively and accumulates the two 64-bit
 * products into PASSN_SUMS.
 *
 * MESSAGE and KEY are each post-incremented by 16.  T0-T7 are clobbered.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Load next message stride
	ld1		{T3.16b}, [MESSAGE], #16

	// Load next key stride
	ld1		{\k3\().4s}, [KEY], #16

	// Add message words to key words
	// (T3 holds the message, so it must be overwritten last)
	add		T0.4s, T3.4s, \k0\().4s
	add		T1.4s, T3.4s, \k1\().4s
	add		T2.4s, T3.4s, \k2\().4s
	add		T3.4s, T3.4s, \k3\().4s

	// Multiply 32x32 => 64 and accumulate
	// (copy the upper 64 bits down so that umlal pairs lane 0 with lane 2
	// and lane 1 with lane 3)
	mov		T4.d[0], T0.d[1]
	mov		T5.d[0], T1.d[1]
	mov		T6.d[0], T2.d[1]
	mov		T7.d[0], T3.d[1]
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm

/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Reads 48 bytes of key up front plus 16 more bytes per message stride, and
 * writes four 64-bit sums (32 bytes, one sum per pass) to 'hash'.
 */
SYM_FUNC_START(nh_neon)

	// Preload the first three key strides and zero the four accumulators
	ld1		{K0.4s,K1.4s}, [KEY], #32
	movi		PASS0_SUMS.2d, #0
	movi		PASS1_SUMS.2d, #0
	ld1		{K2.4s}, [KEY], #16
	movi		PASS2_SUMS.2d, #0
	movi		PASS3_SUMS.2d, #0

	// Main loop: four strides (64 bytes) per iteration.  The key register
	// arguments rotate so that each stride loads into the oldest slot.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN is negative here; masking with 63 recovers the number of
	// bytes still to process (0, 16, 32 or 48).
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)