/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-core.S - AES in ECB/CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8
	.align		3

	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

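	/*
	 * Note: each aese/aesmc and aesd/aesimc pair above is deliberately
	 * kept back to back; many Arm cores fuse such adjacent pairs, so
	 * the pairing should be preserved when modifying these macros.
	 */
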
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm

	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm

	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm

	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q3,
	 * q10 - q13 and ip.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_4x version only)
	 *   q2        : third in/output block (_4x version only)
	 *   q3        : fourth in/output block (_4x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)

	.align		6
aes_decrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)

	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm

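	/*
	 * The round key array is assumed to hold (rounds + 1) round keys of
	 * 16 bytes each.  prepare_key caches the two ends of the array in
	 * registers: q8/q9 receive the first two round keys and q14 the
	 * final one - e.g. for AES-256 (rounds == 14) the final key sits at
	 * byte offset 14 * 16 == 224.  do_block above then streams the
	 * remaining keys through q10-q13, starting at offset 32 (the third
	 * round key, whose address is passed in ip).
	 */
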
	/*
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop4x:
	subs		r4, r4, #4
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop4x:
	subs		r4, r4, #4
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ...and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)

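	/*
	 * In the single block loop of ce_aes_cbc_decrypt below, the XOR with
	 * the previous ciphertext block (or the IV) is folded into the final
	 * round key: aes_decrypt ends with veor q0, q0, q14, so loading q14
	 * with (last round key ^ previous ciphertext) on each iteration
	 * yields the CBC plaintext directly.  q6 holds the unmodified last
	 * round key and q15 carries the previous ciphertext block.
	 */
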
ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:
	subs		r4, r4, #4
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0
	vmov		q5, q1
	vmov		q6, q2
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15
	veor		q1, q1, q4
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4
	beq		.Lcbcdecout
	vmov		q6, q14			@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ keep iv in q15
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)


	/*
	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 */

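	/*
	 * Both CTS routines below handle the final, partial block of a
	 * message using ciphertext stealing: q5 and q6 are loaded from
	 * .Lcts_permute_table at offsets derived from the number of
	 * trailing bytes, so that vtbl/vtbx can shift and splice the
	 * partial block entirely in registers, while the 16 byte loads and
	 * stores at the end of the buffers deliberately overlap.
	 */
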
ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q3}, [ip]

	vld1.8		{q1}, [r5]		@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1		@ xor with iv
	bl		aes_encrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12
	vtbl.8		d3, {d6-d7}, d13

	veor		q0, q0, q1
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)

ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q1}, [ip]

	vld1.8		{q3}, [r5]		@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12
	vtbx.8		d1, {d2-d3}, d13

	veor		q1, q1, q2
	bl		aes_decrypt
	veor		q0, q0, q3		@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)


	/*
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 ctr[])
	 */
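	/*
	 * The counter is kept in q7; its last 32-bit word is also kept
	 * byte swapped in r6 so that it can be incremented with ordinary
	 * ALU instructions.  The cmn/bcs check below drops to the single
	 * block loop whenever that 32-bit word could wrap during this call,
	 * since only the .Lctrcarry path propagates a carry into the upper
	 * words of the counter.
	 */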
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x

	/*
	 * NOTE: the sequence below has been carefully tweaked to avoid
	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
	 * may produce an incorrect result if they take their input from a
	 * register of which a single 32-bit lane has been updated the last
	 * time it was modified. To work around this, the lanes of registers
	 * q0-q3 below are not manipulated individually, and the different
	 * counter values are prepared by successive manipulations of q7.
	 */
	add		ip, r6, #1
	vmov		q0, q7
	rev		ip, ip
	add		lr, r6, #2
	vmov		s31, ip			@ set lane 3 of q1 via q7
	add		ip, r6, #3
	rev		lr, lr
	vmov		q1, q7
	vmov		s31, lr			@ set lane 3 of q2 via q7
	rev		ip, ip
	vmov		q2, q7
	vmov		s31, ip			@ set lane 3 of q3 via q7
	add		r6, r6, #4
	vmov		q3, q7

	vld1.8		{q4-q5}, [r1]!
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip
	b		.Lctrloop4x
.Lctr1x:
	adds		r4, r4, #4
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip
	bcs		.Lctrcarry

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)

	/*
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm

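	/*
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128), using
	 * the XTS convention (little endian byte order, feedback value
	 * 0x87).  A rough C model, purely illustrative and not part of the
	 * build:
	 *
	 *	void next_tweak(u8 t[16])
	 *	{
	 *		int carry = 0;
	 *
	 *		for (int i = 0; i < 16; i++) {
	 *			int msb = t[i] >> 7;
	 *
	 *			t[i] = (t[i] << 1) | carry;
	 *			carry = msb;
	 *		}
	 *		if (carry)
	 *			t[0] ^= 0x87;
	 *	}
	 *
	 * The vshr/vand/vadd/vext/veor sequence above performs the same
	 * doubling on both 64-bit halves at once: the carry out of bit 63
	 * becomes bit 64, and the carry out of bit 127 is folded back into
	 * byte 0 as 0x87, using the mask vector composed in ce_aes_xts_init
	 * below.
	 */
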
ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64
	beq		.Lxtsencout
	subs		r4, r4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3
	sub		r0, r0, #16
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsencctsout
ENDPROC(ce_aes_xts_encrypt)


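	/*
	 * Unlike the ECB/CBC/CTR helpers, the XTS routines take a byte
	 * count rather than a block count.  When that count is not a
	 * multiple of 16, the trailing partial block is handled with
	 * ciphertext stealing via .Lcts_permute_table, in the same way as
	 * the CBC CTS code above.
	 */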
ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf
	subne		r4, r4, #0x10

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5
	bl		aes_decrypt
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsdecctsout
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *			       AES sbox substitution on each byte in
	 *			       'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)
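
	/*
	 * ce_aes_sub relies on aese.8 computing SubBytes(ShiftRows(state ^
	 * key)): with a zeroed state and the input word broadcast to all
	 * four columns, ShiftRows only moves identical bytes around, so
	 * lane 0 of the result is the AES S-box applied to each byte of
	 * the input.  Illustrative C equivalent (assuming some S-box lookup
	 * table; not part of the build):
	 *
	 *	u32 ce_aes_sub(u32 in)
	 *	{
	 *		u32 out = 0;
	 *
	 *		for (int i = 0; i < 4; i++)
	 *			out |= (u32)sbox[(in >> (8 * i)) & 0xff] << (8 * i);
	 *		return out;
	 *	}
	 *
	 * This matches the SubWord step of the AES key schedule.
	 */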

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *					  operation on round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.32		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)

	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
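
	/*
	 * .Lcts_permute_table above consists of 16 bytes of 0xff, the
	 * identity permutation 0x00-0x0f, and another 16 bytes of 0xff.
	 * The CTS code loads 16 byte windows of this table at offsets
	 * derived from the length of the final partial block; the resulting
	 * index vectors make vtbl/vtbx copy exactly the bytes that belong
	 * to the spliced block, because out-of-range indices (0xff) yield
	 * zero with vtbl and leave the destination byte unchanged with
	 * vtbx.
	 */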