/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_FUNC_START(func)		SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func)		SYM_FUNC_END(neon_ ## func)

	xtsmask		.req	v7
	cbciv		.req	v7
	vctr		.req	v4

	.macro		xts_reload_mask, tmp
	xts_load_mask	\tmp
	.endm

	/* special case for the neon-bs driver calling into this one for CTS */
	.macro		xts_cts_skip_tw, reg, lbl
	tbnz		\reg, #1, \lbl
	.endm

	/* multiply by polynomial 'x' in GF(2^8) */
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7
	shl		\out, \in, #1
	and		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm

	/* multiply by polynomial 'x^2' in GF(2^8) */
	.macro		mul_by_x2, out, in, temp, const
	ushr		\temp, \in, #6
	shl		\out, \in, #2
	pmul		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm

	/* preload the entire Sbox */
	.macro		prepare, sbox, shiftrows, temp
	movi		v12.16b, #0x1b
	ldr_l		q13, \shiftrows, \temp
	ldr_l		q14, .Lror32by8, \temp
	adr_l		\temp, \sbox
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm

	/* do preload for encryption */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_sbox, .LForward_ShiftRows, \temp
	.endm

	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* do preload for decryption */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
	.endm

	/* apply SubBytes transformation using the preloaded Sbox */
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v15.16b
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v15.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v15.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
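
	/*
	 * The 256-byte Sbox is held in v16-v31 and looked up in four 64-byte
	 * chunks: tbl zeroes lanes whose index is out of range while tbx
	 * leaves them unchanged, so rebasing the index vector by #0x40 (v15
	 * holds 0x40 when sub_bytes runs) accumulates the full lookup.
	 */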

	/* apply MixColumns transformation */
	.macro		mix_columns, in, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor		\in\().16b, \in\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in\().16b, \in\().16b, v8.16b
	.endif

	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
	rev32		v8.8h, \in\().8h
	eor		v8.16b, v8.16b, v9.16b
	eor		\in\().16b, \in\().16b, v8.16b
	tbl		\in\().16b, {\in\().16b}, v14.16b
	eor		\in\().16b, \in\().16b, v8.16b
	.endm

	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns	\in, \enc
	b		1111b
2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm

	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to AES states in parallel.
	 */
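
	/*
	 * Sketch of how a mode routine might invoke these macros (the real
	 * callers are in aes-modes.S; the register assignments below are
	 * purely illustrative):
	 *
	 *	enc_prepare	w3, x2, x5			// preload Sbox, ShiftRows, constants
	 *	encrypt_block	v0, w3, x2, x5, w6		// one 16-byte block in v0
	 *	encrypt_block4x	v0, v1, v2, v3, w3, x2, x5, w6	// four blocks in parallel
	 */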

	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v15.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v15.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v15.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v15.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b, #7
	shl		\out0\().16b, \in0\().16b, #1
	sshr		\tmp1\().16b, \in1\().16b, #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #1
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	ushr		\tmp0\().16b, \in0\().16b, #6
	shl		\out0\().16b, \in0\().16b, #2
	ushr		\tmp1\().16b, \in1\().16b, #6
	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #2
	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mix_columns_2x, in0, in1, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor		\in0\().16b, \in0\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	.endif

	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		v10.16b, v10.16b, v8.16b
	eor		v11.16b, v11.16b, v9.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	tbl		\in0\().16b, {\in0\().16b}, v14.16b
	tbl		\in1\().16b, {\in1\().16b}, v14.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	.endm

	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm

	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

#include "aes-modes.S"

	.section	".rodata", "a"
	.align		4
.LForward_ShiftRows:
	.octa		0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa		0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
	.octa		0x0c0f0e0d080b0a090407060500030201
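
	/*
	 * The three vectors above are tbl index vectors: the ShiftRows
	 * tables permute the bytes of the column-major AES state for
	 * ShiftRows and InvShiftRows respectively, and .Lror32by8 rotates
	 * each 32-bit column right by 8 bits (loaded into v14 by 'prepare'
	 * and used in the MixColumns macros).
	 */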