132b31808SJens Wiklander /*
232b31808SJens Wiklander * Armv8-A Cryptographic Extension support functions for Aarch64
332b31808SJens Wiklander *
432b31808SJens Wiklander * Copyright The Mbed TLS Contributors
5*b0563631STom Van Eyck * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
632b31808SJens Wiklander */
732b31808SJens Wiklander
8*b0563631STom Van Eyck #if defined(__clang__) && (__clang_major__ >= 4)
9*b0563631STom Van Eyck
10*b0563631STom Van Eyck /* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
11*b0563631STom Van Eyck * but that is defined by build_info.h, and we need this block to happen first. */
12*b0563631STom Van Eyck #if defined(__ARM_ARCH)
13*b0563631STom Van Eyck #if __ARM_ARCH >= 8
14*b0563631STom Van Eyck #define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
15*b0563631STom Van Eyck #endif
16*b0563631STom Van Eyck #endif
17*b0563631STom Van Eyck
18*b0563631STom Van Eyck #if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
1932b31808SJens Wiklander /* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
2032b31808SJens Wiklander *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
2232b31808SJens Wiklander * these are normally only enabled by the -march option on the command line.
2332b31808SJens Wiklander * By defining the macros ourselves we gain access to those declarations without
2432b31808SJens Wiklander * requiring -march on the command line.
2532b31808SJens Wiklander *
26*b0563631STom Van Eyck * `arm_neon.h` is included by common.h, so we put these defines
2732b31808SJens Wiklander * at the top of this file, before any includes.
2832b31808SJens Wiklander */
2932b31808SJens Wiklander #define __ARM_FEATURE_CRYPTO 1
3032b31808SJens Wiklander /* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
3132b31808SJens Wiklander *
3232b31808SJens Wiklander * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
3332b31808SJens Wiklander * for older compilers.
3432b31808SJens Wiklander */
3532b31808SJens Wiklander #define __ARM_FEATURE_AES 1
3632b31808SJens Wiklander #define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
3732b31808SJens Wiklander #endif
3832b31808SJens Wiklander
39*b0563631STom Van Eyck #endif /* defined(__clang__) && (__clang_major__ >= 4) */
40*b0563631STom Van Eyck
4132b31808SJens Wiklander #include <string.h>
4232b31808SJens Wiklander #include "common.h"
4332b31808SJens Wiklander
4432b31808SJens Wiklander #if defined(MBEDTLS_AESCE_C)
4532b31808SJens Wiklander
4632b31808SJens Wiklander #include "aesce.h"
4732b31808SJens Wiklander
48*b0563631STom Van Eyck #if defined(MBEDTLS_AESCE_HAVE_CODE)
4932b31808SJens Wiklander
50*b0563631STom Van Eyck /* Compiler version checks. */
5132b31808SJens Wiklander #if defined(__clang__)
52*b0563631STom Van Eyck # if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
53*b0563631STom Van Eyck # error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
54*b0563631STom Van Eyck # elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
55*b0563631STom Van Eyck # error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
5632b31808SJens Wiklander # endif
5732b31808SJens Wiklander #elif defined(__GNUC__)
5832b31808SJens Wiklander # if __GNUC__ < 6
59*b0563631STom Van Eyck # error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
6032b31808SJens Wiklander # endif
61*b0563631STom Van Eyck #elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies those
 * versions, please update this check and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
65*b0563631STom Van Eyck # if _MSC_VER < 1929
66*b0563631STom Van Eyck # error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
67*b0563631STom Van Eyck # endif
68*b0563631STom Van Eyck #elif defined(__ARMCC_VERSION)
69*b0563631STom Van Eyck # if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies those versions, please update this check and the
 * documentation of `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
73*b0563631STom Van Eyck # error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
74*b0563631STom Van Eyck # elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
75*b0563631STom Van Eyck # error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
76*b0563631STom Van Eyck # endif
77*b0563631STom Van Eyck #endif
78*b0563631STom Van Eyck
79*b0563631STom Van Eyck #if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
80*b0563631STom Van Eyck defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
81*b0563631STom Van Eyck # if defined(__ARMCOMPILER_VERSION)
82*b0563631STom Van Eyck # if __ARMCOMPILER_VERSION <= 6090000
83*b0563631STom Van Eyck # error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
8432b31808SJens Wiklander # else
85*b0563631STom Van Eyck # pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
86*b0563631STom Van Eyck # define MBEDTLS_POP_TARGET_PRAGMA
8732b31808SJens Wiklander # endif
88*b0563631STom Van Eyck # elif defined(__clang__)
89*b0563631STom Van Eyck # pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
90*b0563631STom Van Eyck # define MBEDTLS_POP_TARGET_PRAGMA
91*b0563631STom Van Eyck # elif defined(__GNUC__)
92*b0563631STom Van Eyck # pragma GCC push_options
93*b0563631STom Van Eyck # pragma GCC target ("+crypto")
94*b0563631STom Van Eyck # define MBEDTLS_POP_TARGET_PRAGMA
95*b0563631STom Van Eyck # elif defined(_MSC_VER)
96*b0563631STom Van Eyck # error "Required feature(__ARM_FEATURE_AES) is not enabled."
97*b0563631STom Van Eyck # endif
98*b0563631STom Van Eyck #endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
99*b0563631STom Van Eyck MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */
10032b31808SJens Wiklander
101*b0563631STom Van Eyck #if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
10232b31808SJens Wiklander
10332b31808SJens Wiklander #include <sys/auxv.h>
104*b0563631STom Van Eyck #if !defined(HWCAP_NEON)
105*b0563631STom Van Eyck #define HWCAP_NEON (1 << 12)
106*b0563631STom Van Eyck #endif
107*b0563631STom Van Eyck #if !defined(HWCAP2_AES)
108*b0563631STom Van Eyck #define HWCAP2_AES (1 << 0)
109*b0563631STom Van Eyck #endif
110*b0563631STom Van Eyck #if !defined(HWCAP_AES)
111*b0563631STom Van Eyck #define HWCAP_AES (1 << 3)
112*b0563631STom Van Eyck #endif
113*b0563631STom Van Eyck #if !defined(HWCAP_ASIMD)
114*b0563631STom Van Eyck #define HWCAP_ASIMD (1 << 1)
11532b31808SJens Wiklander #endif
11632b31808SJens Wiklander
117*b0563631STom Van Eyck signed char mbedtls_aesce_has_support_result = -1;
118*b0563631STom Van Eyck
119*b0563631STom Van Eyck #if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
12032b31808SJens Wiklander /*
12132b31808SJens Wiklander * AES instruction support detection routine
12232b31808SJens Wiklander */
mbedtls_aesce_has_support_impl(void)123*b0563631STom Van Eyck int mbedtls_aesce_has_support_impl(void)
12432b31808SJens Wiklander {
125*b0563631STom Van Eyck /* To avoid many calls to getauxval, cache the result. This is
126*b0563631STom Van Eyck * thread-safe, because we store the result in a char so cannot
127*b0563631STom Van Eyck * be vulnerable to non-atomic updates.
128*b0563631STom Van Eyck * It is possible that we could end up setting result more than
129*b0563631STom Van Eyck * once, but that is harmless.
130*b0563631STom Van Eyck */
131*b0563631STom Van Eyck if (mbedtls_aesce_has_support_result == -1) {
132*b0563631STom Van Eyck #if defined(MBEDTLS_ARCH_IS_ARM32)
13332b31808SJens Wiklander unsigned long auxval = getauxval(AT_HWCAP);
134*b0563631STom Van Eyck unsigned long auxval2 = getauxval(AT_HWCAP2);
135*b0563631STom Van Eyck if (((auxval & HWCAP_NEON) == HWCAP_NEON) &&
136*b0563631STom Van Eyck ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
137*b0563631STom Van Eyck mbedtls_aesce_has_support_result = 1;
138*b0563631STom Van Eyck } else {
139*b0563631STom Van Eyck mbedtls_aesce_has_support_result = 0;
140*b0563631STom Van Eyck }
14132b31808SJens Wiklander #else
142*b0563631STom Van Eyck unsigned long auxval = getauxval(AT_HWCAP);
143*b0563631STom Van Eyck if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
144*b0563631STom Van Eyck (HWCAP_ASIMD | HWCAP_AES)) {
145*b0563631STom Van Eyck mbedtls_aesce_has_support_result = 1;
146*b0563631STom Van Eyck } else {
147*b0563631STom Van Eyck mbedtls_aesce_has_support_result = 0;
148*b0563631STom Van Eyck }
14932b31808SJens Wiklander #endif
15032b31808SJens Wiklander }
151*b0563631STom Van Eyck return mbedtls_aesce_has_support_result;
152*b0563631STom Van Eyck }
153*b0563631STom Van Eyck #endif
15432b31808SJens Wiklander
155*b0563631STom Van Eyck #endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
156*b0563631STom Van Eyck
/* Single round of AESCE encryption: vaeseq_u8 performs AddRoundKey +
 * SubBytes + ShiftRows in one instruction, vaesmcq_u8 applies MixColumns,
 * then advance to the next 16-byte round key. */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND

/*
 * Encrypt a single 16-byte block.
 *
 * \param block  the input block, already loaded into a NEON register
 * \param keys   expanded encryption round keys, 16 bytes per round
 * \param rounds number of rounds: 10, 12 or 14 (AES-128/192/256)
 * \return       the encrypted block
 */
MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop: jump past the extra rounds that
     * the shorter key sizes do not need. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
19932b31808SJens Wiklander
/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
/*
 * Decrypt a single 16-byte block.
 *
 * \param block  the input block, already loaded into a NEON register
 * \param keys   decryption round keys (with inverse MixColumns pre-applied
 *               to the middle keys — see mbedtls_aesce_inverse_key()),
 *               16 bytes per round
 * \param rounds number of rounds: 10, 12 or 14 (AES-128/192/256)
 * \return       the decrypted block
 */
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop: jump past the extra rounds that
     * the shorter key sizes do not need. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
#endif
26132b31808SJens Wiklander
26232b31808SJens Wiklander /*
26332b31808SJens Wiklander * AES-ECB block en(de)cryption
26432b31808SJens Wiklander */
mbedtls_aesce_crypt_ecb(mbedtls_aes_context * ctx,int mode,const unsigned char input[16],unsigned char output[16])26532b31808SJens Wiklander int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
26632b31808SJens Wiklander int mode,
26732b31808SJens Wiklander const unsigned char input[16],
26832b31808SJens Wiklander unsigned char output[16])
26932b31808SJens Wiklander {
27032b31808SJens Wiklander uint8x16_t block = vld1q_u8(&input[0]);
27132b31808SJens Wiklander unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);
27232b31808SJens Wiklander
273*b0563631STom Van Eyck #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
274*b0563631STom Van Eyck if (mode == MBEDTLS_AES_DECRYPT) {
27532b31808SJens Wiklander block = aesce_decrypt_block(block, keys, ctx->nr);
276*b0563631STom Van Eyck } else
277*b0563631STom Van Eyck #else
278*b0563631STom Van Eyck (void) mode;
279*b0563631STom Van Eyck #endif
280*b0563631STom Van Eyck {
281*b0563631STom Van Eyck block = aesce_encrypt_block(block, keys, ctx->nr);
28232b31808SJens Wiklander }
28332b31808SJens Wiklander vst1q_u8(&output[0], block);
28432b31808SJens Wiklander
28532b31808SJens Wiklander return 0;
28632b31808SJens Wiklander }
28732b31808SJens Wiklander
/*
 * Compute decryption round keys from encryption round keys
 *
 * Writes nr+1 16-byte round keys to invkey, reversing the order of the
 * forward schedule and applying inverse MixColumns to the middle keys.
 */
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    /* First decryption round key: the last encryption round key, as-is. */
    vst1q_u8(invkey, vld1q_u8(fwdkey + 16 * nr));

    /* Middle keys: forward keys in reverse order, each transformed with
     * inverse MixColumns (vaesimcq_u8). */
    for (int i = 1; i < nr; i++) {
        vst1q_u8(invkey + 16 * i,
                 vaesimcq_u8(vld1q_u8(fwdkey + 16 * (nr - i))));
    }

    /* Last decryption round key: the first encryption round key, as-is. */
    vst1q_u8(invkey + 16 * nr, vld1q_u8(fwdkey));
}
#endif
30732b31808SJens Wiklander
/* Cyclic rotation of a 32-bit word: rotate right by 8 bits
 * (equivalently, rotate left by 24). */
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word >> 8) | (word << 24);
}
31232b31808SJens Wiklander
/* AES SubWord: apply the S-box to each byte of a 32-bit word, using the
 * AESE instruction instead of a lookup table. */
static inline uint32_t aes_sub_word(uint32_t in)
{
    /* Broadcast the word across a vector. AESE with the zero vector as
     * data XORs in `state` as the round key, then performs SubBytes and
     * ShiftRows. ShiftRows does not move the first row, so lane 0 of the
     * u32 view holds exactly SubWord(in). */
    uint8x16_t state = vreinterpretq_u8_u32(vdupq_n_u32(in));
    state = vaeseq_u8(vdupq_n_u8(0), state);
    return vgetq_lane_u32(vreinterpretq_u32_u8(state), 0);
}
32332b31808SJens Wiklander
/*
 * Key expansion function
 *
 * Expands `key` (key_bit_length = 128, 192 or 256 bits) into the round-key
 * schedule `rk`, following FIPS-197. The caller must supply an `rk` buffer
 * large enough for Nb*(Nr+1) 32-bit words.
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    /* Round constants rcon[i] XORed into the first word of each group. */
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     * - Section 5, Nr = Nk + 6
     * - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const size_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;              /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;    /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);     /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    /* The first Nk words of the schedule are the key itself. */
    memcpy(rk, key, key_len_in_words * 4);

    /* Expand one group of Nk words per iteration, until the schedule of
     * Nb*(Nr+1) words is full. */
    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        /* First word of the group: RotWord(SubWord(previous word)),
         * XORed with the round constant and with the word Nk back. */
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words.*/
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                /* Nk == 6: two further words per group. */
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                /* Nk == 8: per FIPS-197, the word Nk/2 into the group gets
                 * an extra SubWord; then three further words. */
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}
38032b31808SJens Wiklander
38132b31808SJens Wiklander /*
38232b31808SJens Wiklander * Key expansion, wrapper
38332b31808SJens Wiklander */
mbedtls_aesce_setkey_enc(unsigned char * rk,const unsigned char * key,size_t bits)38432b31808SJens Wiklander int mbedtls_aesce_setkey_enc(unsigned char *rk,
38532b31808SJens Wiklander const unsigned char *key,
38632b31808SJens Wiklander size_t bits)
38732b31808SJens Wiklander {
38832b31808SJens Wiklander switch (bits) {
38932b31808SJens Wiklander case 128:
39032b31808SJens Wiklander case 192:
39132b31808SJens Wiklander case 256:
39232b31808SJens Wiklander aesce_setkey_enc(rk, key, bits);
39332b31808SJens Wiklander break;
39432b31808SJens Wiklander default:
39532b31808SJens Wiklander return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
39632b31808SJens Wiklander }
39732b31808SJens Wiklander
39832b31808SJens Wiklander return 0;
39932b31808SJens Wiklander }
40032b31808SJens Wiklander
40132b31808SJens Wiklander #if defined(MBEDTLS_GCM_C)
40232b31808SJens Wiklander
#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which are listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

/* 64x64 -> 128-bit carry-less (polynomial) multiply, emitted directly as the
 * A32 vmull.p64 instruction since clang does not provide the intrinsic here. */
static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

/* As vmull_p64, but multiplying the high 64-bit halves of the operands. */
static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm %[p], { r2-r5 } \n\t"
         "rbit r2, r2 \n\t"
         "rev r2, r2 \n\t"
         "rbit r3, r3 \n\t"
         "rev r3, r3 \n\t"
         "rbit r4, r4 \n\t"
         "rev r4, r4 \n\t"
         "rbit r5, r5 \n\t"
         "rev r5, r5 \n\t"
         "stm %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */
459*b0563631STom Van Eyck
#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */


#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5, and Clang on 32-bit */

#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

/* Extract the low 64 bits of a 128-bit polynomial vector. */
static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;

}

#endif /* COMMON_MISSING_INTRINSICS */
48132b31808SJens Wiklander
/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible-type error without the cast: GCC considers
 * poly64_t and poly64x1_t to be distinct types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast. Clang accepts it
 * either way (it treats poly64_t and poly64x1_t as the same type), so no
 * cast is used for Clang either. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

/* Carry-less multiply of the low 64-bit halves of a and b, returning the
 * 128-bit polynomial product reinterpreted as uint8x16_t. */
static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{

    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}
50732b31808SJens Wiklander
/* Carry-less multiply of the high 64-bit halves of a and b; counterpart
 * of pmull_low(). */
static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}
51432b31808SJens Wiklander
/* GHASH does 128b polynomial multiplication on block in GF(2^128) defined by
 * `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need to do 4 64b
 * multiplies to generate a 128b result.
 *
 * `poly_mult_128` executes the polynomial multiplication and outputs 256b
 * represented by 3 128b values due to code size optimization.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high 128b   |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low 128b    |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*   :  :l1:l0 = a0*b0 */
    /* Swap the 64-bit halves of b so the cross products can be formed. */
    c = vextq_u8(b, b, 8); /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}
54932b31808SJens Wiklander
/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
 * z^128 +r(z), where r(z) = z^7+z^2+z+ 1. The well known approach is to
 * consider that z^128 ≡r(z) (mod z^128 +r(z)), allowing us to write the 256-bit
 * operand to be reduced as a(z) = h(z)z^128 +l(z)≡h(z)r(z) + l(z). That is, we
 * simply multiply the higher part of the operand by r(z) and add it to l(z). If
 * the result is still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    /* Build the reduction constant 0x87 (the low byte of r(z)) in a
     * register, then shift it into place below. */
    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. It is for GNUC compatible compilers.
     */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00              */
    m = input.val[1];          /*   :m2:m1:00              */
    l = input.val[2];          /*   :  :l1:l0              */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3 */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2 */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2 */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00      */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0 */
    o = veorq_u8(n, f);        /*         o1:o0 = f1:f0 + n1:n0 */
    return veorq_u8(o, g);     /*               = o1:o0 + g1:00 */
}
59032b31808SJens Wiklander
59132b31808SJens Wiklander /*
59232b31808SJens Wiklander * GCM multiplication: c = a times b in GF(2^128)
59332b31808SJens Wiklander */
mbedtls_aesce_gcm_mult(unsigned char c[16],const unsigned char a[16],const unsigned char b[16])59432b31808SJens Wiklander void mbedtls_aesce_gcm_mult(unsigned char c[16],
59532b31808SJens Wiklander const unsigned char a[16],
59632b31808SJens Wiklander const unsigned char b[16])
59732b31808SJens Wiklander {
59832b31808SJens Wiklander uint8x16_t va, vb, vc;
59932b31808SJens Wiklander va = vrbitq_u8(vld1q_u8(&a[0]));
60032b31808SJens Wiklander vb = vrbitq_u8(vld1q_u8(&b[0]));
60132b31808SJens Wiklander vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
60232b31808SJens Wiklander vst1q_u8(&c[0], vc);
60332b31808SJens Wiklander }
60432b31808SJens Wiklander
60532b31808SJens Wiklander #endif /* MBEDTLS_GCM_C */
60632b31808SJens Wiklander
60732b31808SJens Wiklander #if defined(MBEDTLS_POP_TARGET_PRAGMA)
60832b31808SJens Wiklander #if defined(__clang__)
60932b31808SJens Wiklander #pragma clang attribute pop
61032b31808SJens Wiklander #elif defined(__GNUC__)
61132b31808SJens Wiklander #pragma GCC pop_options
61232b31808SJens Wiklander #endif
61332b31808SJens Wiklander #undef MBEDTLS_POP_TARGET_PRAGMA
61432b31808SJens Wiklander #endif
61532b31808SJens Wiklander
616*b0563631STom Van Eyck #endif /* MBEDTLS_AESCE_HAVE_CODE */
61732b31808SJens Wiklander
61832b31808SJens Wiklander #endif /* MBEDTLS_AESCE_C */
619