/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
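	/*
	 * Thin wrapper around the 64x64 -> 128 bit carryless multiply
	 * instruction. The b1..b4 operands are accepted but unused: they
	 * only exist so that __pmull_p64 and __pmull_p8 can be invoked
	 * with the same argument list from the ghash_update macro below.
	 */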
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm
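	/*
	 * The reduction macros below fold the 256-bit product held in
	 * XL (low), XM (middle/Karatsuba term) and XH (high) back into a
	 * 128-bit value in XL, modulo the GHASH field polynomial
	 * x^128 + x^7 + x^2 + x + 1. The p64 variant does this with two
	 * multiplications by the constant loaded into MASK by the p64
	 * entry point (0xe1 shifted left by 57, i.e. 0xc200000000000000);
	 * the p8 variant uses an equivalent shift-and-xor sequence whose
	 * shift amounts (57/62/63 and 1/2/7) mirror the x^7, x^2 and x
	 * terms of that polynomial.
	 */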
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
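	//
	// Core update loop, instantiated once for each of the two variants.
	// Per the C prototype further down and the AAPCS:
	//   r0   - number of blocks to process
	//   r1   - pointer to the 16-byte digest (dg[])
	//   r2   - source data
	//   r3   - key structure (already consumed by the ENTRY code)
	//   [sp] - optional pointer to a single 'head' block, done first
	//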
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
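	// The four-way path above (p64 only) folds four blocks into the
	// accumulator per iteration, multiplying them by the four values
	// loaded from the key structure (HH4, HH3, HH and SHASH, i.e. the
	// precomputed powers of H) so that one reduction covers all four
	// products. The path below handles a single block at a time: it is
	// the only path for the p8 variant, and is also taken by the p64
	// variant until the remaining block count is a multiple of 4.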
2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)