/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32
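
	// The masks above and the byte-rotated copies of the key prepared
	// below let __pmull_p8_tail piece together a 64x64 carryless
	// multiply out of 8-bit PMULL operations, for CPUs that lack the
	// 64-bit PMULL variant.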

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4
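
	// 4-way aggregated GHASH: the loop below folds four blocks per
	// iteration using SHASH and the precomputed higher powers of the
	// hash key in HH, HH3 and HH4, so a single reduction covers all
	// four blocks.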

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
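
	// Karatsuba recombination: XM holds (a1 + a0)(b1 + b0); folding in
	// XL and XH at label 4 below recovers the middle term a1*b0 + a0*b1
	// of the 256-bit product ahead of the reduction.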

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4 K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
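	// Out-of-line path for the larger key sizes: with 12 or 14 rounds
	// (AES-192/AES-256), run the extra rounds before rejoining the
	// common tail above.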
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	stp		x29, x30, [sp, #-32]!
	mov		x29, sp
	str		x19, [sp, #24]

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f			// tag only?

	ldr		w8, [x5, #12]		// load lower counter
CPU_LE(	rev		w8, w8	)

0:	mov		w9, #4			// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4		// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte        |        |        |        |x       |
	 * 16 bytes       |        |        |        |xxxxxxxx|
	 * 17 bytes       |        |        |xxxxxxxx|x       |
	 * 47 bytes       |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldp		x19, x10, [sp, #24]
	cbz		x10, 5f			// output tag?
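
	// Tag generation: hash the lengths block into the digest, encrypt
	// the initial counter block, and XOR the two to form the GCM tag.
	// On the decrypt path, the computed tag is instead compared with
	// the supplied one, returning 0 in w0 on match and a negative
	// value on mismatch.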

	ld1		{INP3.16b}, [x10]	// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)	// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]		// store tag
	.else
	ldp		x11, x12, [sp, #40]	// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]	// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]	// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b	// compare tags
	mvn		XL.16b, XL.16b		// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
	sminv		b0, XL.16b		// signed minimum across XL
	smov		w0, v0.b[0]		// return b0
	.endif

4:	ldp		x29, x30, [sp], #32
	ret

5:
CPU_LE(	rev		w8, w8	)
	str		w8, [x5, #12]		// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f		// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f		// 2 blocks?
	tbz		w9, #1, 2f		// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous