/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

/*
 * MAX_STRIDE is the number of blocks handled per iteration of the bulk
 * loops. It defaults to 4 unless the including file defined it first.
 *
 * ST4(x) emits x only when MAX_STRIDE == 4; ST5(x) emits x only otherwise.
 * They let a single loop body carry the extra fifth-block instructions
 * without duplicating the whole loop.
 */
#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif

	/*
	 * Out-of-line wrappers around the N-way block macros, so the mode
	 * routines can reach them with a plain bl instead of expanding the
	 * macro at every call site.
	 *
	 * Data blocks are passed and returned in v0-v3 (plus v4 for the
	 * 5-way variants). w3 and x2 are forwarded to the macro; every caller
	 * in this file holds rounds in w3 and the round key pointer in x2 at
	 * the point of the call. x8 and w7 are presumably scratch registers
	 * for the macro - confirm against the macro definitions in the
	 * including file.
	 */
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 *
	 * Register roles, following the prototypes above:
	 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
	 *
	 * Both routines consume MAX_STRIDE blocks per iteration for as long
	 * as that many remain, then finish the remainder one block at a time.
	 * x29/x30 are saved because the bulk path calls the local helpers
	 * with bl, which overwrites the link register.
	 */

AES_FUNC_START(aes_ecb_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE		/* fewer than a full stride left? */
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE		/* undo the subtraction above */
	beq		.Lecbencout			/* nothing left */
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE		/* fewer than a full stride left? */
	bmi		.Lecbdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE		/* undo the subtraction above */
	beq		.Lecbdecout			/* nothing left */
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 *
	 * Register roles for the two encrypt entry points below:
	 *   x0 = out, x1 = in, x2 = rk/rk1, w3 = rounds, w4 = blocks,
	 *   x5 = iv (read on entry, updated on exit), x6 = rk2 (ESSIV only)
	 *   v4 = running chaining value
	 *
	 * These are leaf routines: they only expand the single-block macro
	 * and never use bl, so no stack frame is set up.
	 */

	/*
	 * ESSIV entry: encrypt the IV with the key schedule at x6 using a
	 * fixed 14 rounds, then switch to the data key at x2 (via
	 * enc_switch_key, defined in the including file) and join the plain
	 * CBC loop.
	 */
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	/*
	 * Each block is xored with the previous ciphertext before being
	 * encrypted, so the four blocks loaded per iteration are still
	 * processed strictly one after another.
	 */
.Lcbcencloop4x:
	subs		w4, w4, #4
	bmi		.Lcbcenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b			/* last ct is next iv */
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4			/* undo the subtraction above */
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

/*
 * CBC decryption (see the aes_cbc_decrypt / aes_essiv_cbc_decrypt
 * prototypes in the comment preceding the CBC encrypt routines).
 *
 * Register roles:
 *   x0 = out, x1 = in, x2 = rk/rk1, w3 = rounds, w4 = blocks,
 *   x5 = iv (read on entry, updated on exit), x6 = rk2 (ESSIV only)
 *   cbciv = register alias for the chaining value, defined by the
 *	     including file
 *
 * The ESSIV entry first encrypts the IV with the key schedule at x6 using
 * a fixed 14 rounds, then joins the common path.
 *
 * In the bulk loop the ciphertext blocks needed for the final xor are kept
 * in spare vector registers across the helper call; the ones that do not
 * fit are re-read from memory by rewinding x1.
 */
AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b			/* keep ct 0..2 for chaining */
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32			/* back up to ct block 3 */
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b			/* keep ct 0..2 for chaining */
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16			/* back up to ct block 3 */
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE		/* undo the subtraction above */
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 *
	 * Register roles:
	 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv
	 *
	 * Both routines handle exactly two blocks, the second of which may be
	 * partial: one 16-byte load at in and one at in + (bytes - 16), with
	 * the matching pair of stores. The loads and stores therefore overlap
	 * when bytes < 32. The code assumes 16 < bytes <= 32 - this is not
	 * checked here, confirm against the callers.
	 *
	 * x8 and x9 index into .Lcts_permute_table at offsets (bytes - 16) and
	 * 32 - (bytes - 16) to fetch the two byte-shuffle vectors used by
	 * tbl/tbx.
	 */

AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16			/* x4 = size of final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16			/* x4 = size of final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

	/*
	 * 48-byte table: 16 x 0xff, the identity permutation 0..15, then
	 * 16 x 0xff again. A 16-byte window taken at a variable offset gives
	 * a tbl/tbx index vector that shifts bytes by that offset; the 0xff
	 * entries are out-of-range indices, which tbl turns into zero bytes
	 * and tbx leaves untouched in the destination.
	 */
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 *
	 * Register roles:
	 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks,
	 *   x5 = ctr (read on entry, next counter value written on exit)
	 *   vctr = register alias for the big-endian counter block, defined
	 *	    by the including file
	 *   x6   = low 64 bits of the counter, byte-swapped to CPU order
	 *
	 * The bulk loop only patches the last 32-bit lane of each counter
	 * block, so it is used only when adding the block count to the low
	 * 32 bits of the counter does not carry. Otherwise everything goes
	 * through the single-block loop, which propagates carries into the
	 * upper 64 bits.
	 */

AES_FUNC_START(aes_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x6
	ld1		{vctr.16b}, [x5]

	umov		x6, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x6, x6
	cmn		w6, w4			/* 32 bit overflow? */
	bcs		.Lctrloop
.LctrloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lctr1x
	/* v0 uses the current counter; v1..v3 (v4) use counter + 1..3 (4) */
	add		w7, w6, #1
	mov		v0.16b, vctr.16b
	add		w8, w6, #2
	mov		v1.16b, vctr.16b
	add		w9, w6, #3
	mov		v2.16b, vctr.16b
	rev		w7, w7
	mov		v3.16b, vctr.16b
	rev		w8, w8
ST5(	mov		v4.16b, vctr.16b		)
	mov		v1.s[3], w7
	rev		w9, w9
ST5(	add		w10, w6, #4			)
	mov		v2.s[3], w8
ST5(	rev		w10, w10			)
	mov		v3.s[3], w9
ST5(	mov		v4.s[3], w10			)
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	add		x6, x6, #MAX_STRIDE
	rev		x7, x6
	ins		vctr.d[1], x7
	cbz		w4, .Lctrout
	b		.LctrloopNx
.Lctr1x:
	adds		w4, w4, #MAX_STRIDE	/* undo the subtraction above */
	beq		.Lctrout
.Lctrloop:
	mov		v0.16b, vctr.16b
	encrypt_block	v0, w3, x2, x8, w7

	adds		x6, x6, #1		/* increment BE ctr */
	rev		x7, x6
	ins		vctr.d[1], x7
	bcs		.Lctrcarry		/* overflow? */

.Lctrcarrydone:
	subs		w4, w4, #1
	bmi		.Lctrtailblock		/* blocks <0 means tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	bne		.Lctrloop

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

	/*
	 * Reached only when w4 was already below one on entry to .Lctrloop
	 * (a negative block count): the raw keystream block is written to
	 * out and in is not read at all. Presumably the caller xors the
	 * partial tail itself - confirm against the glue code.
	 */
.Lctrtailblock:
	st1		{v0.16b}, [x0]
	b		.Lctrout

.Lctrcarry:
	umov		x7, vctr.d[0]		/* load upper word of ctr  */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		vctr.d[0], x7
	b		.Lctrcarrydone
AES_FUNC_END(aes_ctr_encrypt)


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 *
	 * Register roles:
	 *   x0 = out, x1 = in, x2 = rk1 (data key), w3 = rounds, w4 = bytes,
	 *   x5 = rk2 (tweak key), x6 = iv (read on entry, current tweak
	 *   written back on exit), w7 = first
	 *   v4 = current tweak, v5-v7 = following tweaks in the 4-way loop
	 *   xtsmask = register alias defined by the including file
	 *
	 * When first is non-zero the IV is encrypted with rk2 to form the
	 * initial tweak; otherwise the value at iv is advanced by one
	 * next_tweak step and used as is. xts_cts_skip_tw and xts_reload_mask
	 * come from the including file and are not visible here.
	 *
	 * A byte count that is not a multiple of 16 is finished with
	 * ciphertext stealing, using .Lcts_permute_table.
	 */

	/*
	 * next_tweak: out = in multiplied by x in GF(2^128).
	 * Each 64-bit half is doubled; the bit shifted out of each half is
	 * masked with xtsmask, moved to the other half by the ext, and xored
	 * in. With the mask built by xts_load_mask that carries the low
	 * half into bit 0 of the high half and folds the high half overflow
	 * back in as 0x87. Clobbers \tmp.
	 */
	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	/*
	 * xts_load_mask: set xtsmask to the 32-bit lanes { 0x1, 0x0, 0x87,
	 * 0x0 }, i.e. 0x1 in the low 64 bits and 0x87 in the high 64 bits.
	 * Clobbers \tmp.
	 */
	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm

AES_FUNC_START(aes_xts_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64			/* undo the subtraction above */
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx			/* only a partial block left */
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts			/* next block is partial */
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

	/*
	 * Ciphertext stealing. .LxtsencctsNx is entered straight from the
	 * 4-way path: the block to steal from is still in v3 and has already
	 * been stored, so copy it to v0 and step out back by one block.
	 */
.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]		/* overlapping stores */
	mov		w4, wzr			/* makes .Lxtsencctsout finish */
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64			/* undo the subtraction above */
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16		/* ld1 leaves the flags alone */
	bmi		.Lxtsdeccts			/* tests the preceding subs */
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

	/*
	 * Ciphertext stealing: the last full ciphertext block (already in
	 * v0) is decrypted with the tweak after the current one (v5), and
	 * the block rebuilt from the partial tail then goes through
	 * .Lxtsdecctsout with the current tweak (v4).
	 */
.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]		/* overlapping stores */
	mov		w4, wzr			/* makes .Lxtsdecctsout finish */
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 *
	 * Register roles:
	 *   x0 = in, x1 = rk, w2 = rounds, w3 = blocks, x4 = dg,
	 *   w5 = enc_before, w6 = enc_after
	 *   v0 = running digest
	 *
	 * Returns in w0 the number of blocks not yet consumed (w3 at the
	 * time of return).
	 *
	 * The digest is encrypted once up front when enc_before is non-zero.
	 * After each block is xored in, the digest is encrypted again unless
	 * that was the last block and enc_after is zero: csinv yields
	 * enc_after when no blocks remain and all-ones otherwise.
	 *
	 * cond_yield comes from outside this file; presumably it branches to
	 * .Lmacout when the CPU should be given up, so that the caller can
	 * resume with the returned block count - confirm against its
	 * definition.
	 */
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq			/* last block ? enc_after : ~0 */
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4			/* undo the subtraction above */
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq			/* last block ? enc_after : ~0 */
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3
	ret
AES_FUNC_END(aes_mac_update)