xref: /optee_os/lib/libmbedtls/mbedtls/library/aesce.c (revision b0563631928755fe864b97785160fb3088e9efdc)
132b31808SJens Wiklander /*
232b31808SJens Wiklander  *  Armv8-A Cryptographic Extension support functions for Aarch64
332b31808SJens Wiklander  *
432b31808SJens Wiklander  *  Copyright The Mbed TLS Contributors
5*b0563631STom Van Eyck  *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
632b31808SJens Wiklander  */
732b31808SJens Wiklander 
8*b0563631STom Van Eyck #if defined(__clang__) &&  (__clang_major__ >= 4)
9*b0563631STom Van Eyck 
10*b0563631STom Van Eyck /* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
11*b0563631STom Van Eyck  * but that is defined by build_info.h, and we need this block to happen first. */
12*b0563631STom Van Eyck #if defined(__ARM_ARCH)
13*b0563631STom Van Eyck #if __ARM_ARCH >= 8
14*b0563631STom Van Eyck #define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
15*b0563631STom Van Eyck #endif
16*b0563631STom Van Eyck #endif
17*b0563631STom Van Eyck 
18*b0563631STom Van Eyck #if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
1932b31808SJens Wiklander /* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
2032b31808SJens Wiklander  *
2132b31808SJens Wiklander  * The intrinsic declaration are guarded by predefined ACLE macros in clang:
2232b31808SJens Wiklander  * these are normally only enabled by the -march option on the command line.
2332b31808SJens Wiklander  * By defining the macros ourselves we gain access to those declarations without
2432b31808SJens Wiklander  * requiring -march on the command line.
2532b31808SJens Wiklander  *
26*b0563631STom Van Eyck  * `arm_neon.h` is included by common.h, so we put these defines
2732b31808SJens Wiklander  * at the top of this file, before any includes.
2832b31808SJens Wiklander  */
2932b31808SJens Wiklander #define __ARM_FEATURE_CRYPTO 1
3032b31808SJens Wiklander /* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
3132b31808SJens Wiklander  *
3232b31808SJens Wiklander  * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
3332b31808SJens Wiklander  * for older compilers.
3432b31808SJens Wiklander  */
3532b31808SJens Wiklander #define __ARM_FEATURE_AES    1
3632b31808SJens Wiklander #define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
3732b31808SJens Wiklander #endif
3832b31808SJens Wiklander 
39*b0563631STom Van Eyck #endif /* defined(__clang__) &&  (__clang_major__ >= 4) */
40*b0563631STom Van Eyck 
4132b31808SJens Wiklander #include <string.h>
4232b31808SJens Wiklander #include "common.h"
4332b31808SJens Wiklander 
4432b31808SJens Wiklander #if defined(MBEDTLS_AESCE_C)
4532b31808SJens Wiklander 
4632b31808SJens Wiklander #include "aesce.h"
4732b31808SJens Wiklander 
48*b0563631STom Van Eyck #if defined(MBEDTLS_AESCE_HAVE_CODE)
4932b31808SJens Wiklander 
50*b0563631STom Van Eyck /* Compiler version checks. */
5132b31808SJens Wiklander #if defined(__clang__)
52*b0563631STom Van Eyck #   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
53*b0563631STom Van Eyck #       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
54*b0563631STom Van Eyck #   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
55*b0563631STom Van Eyck #       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
5632b31808SJens Wiklander #   endif
5732b31808SJens Wiklander #elif defined(__GNUC__)
5832b31808SJens Wiklander #   if __GNUC__ < 6
59*b0563631STom Van Eyck #       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
6032b31808SJens Wiklander #   endif
61*b0563631STom Van Eyck #elif defined(_MSC_VER)
62*b0563631STom Van Eyck /* TODO: We haven't verified MSVC from 1920 to 1928. If someone verified that,
63*b0563631STom Van Eyck  *       please update this and document of `MBEDTLS_AESCE_C` in
64*b0563631STom Van Eyck  *       `mbedtls_config.h`. */
65*b0563631STom Van Eyck #   if _MSC_VER < 1929
66*b0563631STom Van Eyck #       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
67*b0563631STom Van Eyck #   endif
68*b0563631STom Van Eyck #elif defined(__ARMCC_VERSION)
69*b0563631STom Van Eyck #    if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
70*b0563631STom Van Eyck /* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
71*b0563631STom Van Eyck  * If someone verified that, please update this and document of
72*b0563631STom Van Eyck  * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
73*b0563631STom Van Eyck #         error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
74*b0563631STom Van Eyck #    elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
75*b0563631STom Van Eyck #         error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
76*b0563631STom Van Eyck #    endif
77*b0563631STom Van Eyck #endif
78*b0563631STom Van Eyck 
79*b0563631STom Van Eyck #if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
80*b0563631STom Van Eyck     defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
81*b0563631STom Van Eyck #   if defined(__ARMCOMPILER_VERSION)
82*b0563631STom Van Eyck #       if __ARMCOMPILER_VERSION <= 6090000
83*b0563631STom Van Eyck #           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
8432b31808SJens Wiklander #       else
85*b0563631STom Van Eyck #           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
86*b0563631STom Van Eyck #           define MBEDTLS_POP_TARGET_PRAGMA
8732b31808SJens Wiklander #       endif
88*b0563631STom Van Eyck #   elif defined(__clang__)
89*b0563631STom Van Eyck #       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
90*b0563631STom Van Eyck #       define MBEDTLS_POP_TARGET_PRAGMA
91*b0563631STom Van Eyck #   elif defined(__GNUC__)
92*b0563631STom Van Eyck #       pragma GCC push_options
93*b0563631STom Van Eyck #       pragma GCC target ("+crypto")
94*b0563631STom Van Eyck #       define MBEDTLS_POP_TARGET_PRAGMA
95*b0563631STom Van Eyck #   elif defined(_MSC_VER)
96*b0563631STom Van Eyck #       error "Required feature(__ARM_FEATURE_AES) is not enabled."
97*b0563631STom Van Eyck #   endif
98*b0563631STom Van Eyck #endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
99*b0563631STom Van Eyck           MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */
10032b31808SJens Wiklander 
101*b0563631STom Van Eyck #if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
10232b31808SJens Wiklander 
10332b31808SJens Wiklander #include <sys/auxv.h>
104*b0563631STom Van Eyck #if !defined(HWCAP_NEON)
105*b0563631STom Van Eyck #define HWCAP_NEON  (1 << 12)
106*b0563631STom Van Eyck #endif
107*b0563631STom Van Eyck #if !defined(HWCAP2_AES)
108*b0563631STom Van Eyck #define HWCAP2_AES  (1 << 0)
109*b0563631STom Van Eyck #endif
110*b0563631STom Van Eyck #if !defined(HWCAP_AES)
111*b0563631STom Van Eyck #define HWCAP_AES   (1 << 3)
112*b0563631STom Van Eyck #endif
113*b0563631STom Van Eyck #if !defined(HWCAP_ASIMD)
114*b0563631STom Van Eyck #define HWCAP_ASIMD (1 << 1)
11532b31808SJens Wiklander #endif
11632b31808SJens Wiklander 
117*b0563631STom Van Eyck signed char mbedtls_aesce_has_support_result = -1;
118*b0563631STom Van Eyck 
119*b0563631STom Van Eyck #if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
12032b31808SJens Wiklander /*
12132b31808SJens Wiklander  * AES instruction support detection routine
12232b31808SJens Wiklander  */
mbedtls_aesce_has_support_impl(void)123*b0563631STom Van Eyck int mbedtls_aesce_has_support_impl(void)
12432b31808SJens Wiklander {
125*b0563631STom Van Eyck     /* To avoid many calls to getauxval, cache the result. This is
126*b0563631STom Van Eyck      * thread-safe, because we store the result in a char so cannot
127*b0563631STom Van Eyck      * be vulnerable to non-atomic updates.
128*b0563631STom Van Eyck      * It is possible that we could end up setting result more than
129*b0563631STom Van Eyck      * once, but that is harmless.
130*b0563631STom Van Eyck      */
131*b0563631STom Van Eyck     if (mbedtls_aesce_has_support_result == -1) {
132*b0563631STom Van Eyck #if defined(MBEDTLS_ARCH_IS_ARM32)
13332b31808SJens Wiklander         unsigned long auxval  = getauxval(AT_HWCAP);
134*b0563631STom Van Eyck         unsigned long auxval2 = getauxval(AT_HWCAP2);
135*b0563631STom Van Eyck         if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
136*b0563631STom Van Eyck             ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
137*b0563631STom Van Eyck             mbedtls_aesce_has_support_result = 1;
138*b0563631STom Van Eyck         } else {
139*b0563631STom Van Eyck             mbedtls_aesce_has_support_result = 0;
140*b0563631STom Van Eyck         }
14132b31808SJens Wiklander #else
142*b0563631STom Van Eyck         unsigned long auxval = getauxval(AT_HWCAP);
143*b0563631STom Van Eyck         if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
144*b0563631STom Van Eyck             (HWCAP_ASIMD | HWCAP_AES)) {
145*b0563631STom Van Eyck             mbedtls_aesce_has_support_result = 1;
146*b0563631STom Van Eyck         } else {
147*b0563631STom Van Eyck             mbedtls_aesce_has_support_result = 0;
148*b0563631STom Van Eyck         }
14932b31808SJens Wiklander #endif
15032b31808SJens Wiklander     }
151*b0563631STom Van Eyck     return mbedtls_aesce_has_support_result;
152*b0563631STom Van Eyck }
153*b0563631STom Van Eyck #endif
15432b31808SJens Wiklander 
155*b0563631STom Van Eyck #endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
156*b0563631STom Van Eyck 
/* One round of AESCE encryption: AddRoundKey + SubBytes + ShiftRows
 * (vaeseq_u8) followed by MixColumns (vaesmcq_u8). */
#define AESCE_ENC_ROUND                       \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two consecutive encryption rounds */
#define AESCE_ENC_ROUND_X2      AESCE_ENC_ROUND; AESCE_ENC_ROUND

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* Fully unrolled for 10, 12 or 14 rounds: shorter key sizes jump
     * over the extra leading rounds. */
    switch (rounds) {
        case 10:
            goto enc_rounds_10;
        case 12:
            goto enc_rounds_12;
        default:
            break;
    }
    AESCE_ENC_ROUND_X2;
enc_rounds_12:
    AESCE_ENC_ROUND_X2;
enc_rounds_10:
    AESCE_ENC_ROUND_X2;
    AESCE_ENC_ROUND_X2;
    AESCE_ENC_ROUND_X2;
    AESCE_ENC_ROUND_X2;
    AESCE_ENC_ROUND;

    /* Penultimate round key: AddRoundKey for it, plus SubBytes and
     * ShiftRows of the final round (which has no MixColumns). */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final AddRoundKey. */
    return veorq_u8(block, vld1q_u8(keys));
}
19932b31808SJens Wiklander 
/* One round of AESCE decryption:
 *
 * The inverses of AddRoundKey, SubBytes and ShiftRows are performed in a
 * single instruction (vaesdq_u8), followed by the inverse MixColumns for
 * the *next* round (vaesimcq_u8).
 *
 * This swaps the order of the inverse AddRoundKey and inverse MixColumns
 * operations relative to the textbook inverse cipher. We have to, because
 * AddRoundKey is fused atomically with the inverses of SubBytes and
 * ShiftRows. The swap is valid because MixColumns is linear over GF(2^8)
 * and AddRoundKey is an exclusive or (addition in GF(2^8)) — provided the
 * affected round keys have had the inverse MixColumns applied to them
 * separately, which was done when the decryption round keys were
 * calculated. */
#define AESCE_DEC_ROUND                       \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two consecutive decryption rounds */
#define AESCE_DEC_ROUND_X2      AESCE_DEC_ROUND; AESCE_DEC_ROUND

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* Fully unrolled for 10, 12 or 14 rounds: shorter key sizes jump
     * over the extra leading rounds. */
    switch (rounds) {
        case 10:
            goto dec_rounds_10;
        case 12:
            goto dec_rounds_12;
        default:
            break;
    }
    AESCE_DEC_ROUND_X2;
dec_rounds_12:
    AESCE_DEC_ROUND_X2;
dec_rounds_10:
    AESCE_DEC_ROUND_X2;
    AESCE_DEC_ROUND_X2;
    AESCE_DEC_ROUND_X2;
    AESCE_DEC_ROUND_X2;
    AESCE_DEC_ROUND;

    /* Inverses of AddRoundKey, SubBytes and ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey undoing the initial round key addition. */
    return veorq_u8(block, vld1q_u8(keys));
}
#endif
26132b31808SJens Wiklander 
26232b31808SJens Wiklander /*
26332b31808SJens Wiklander  * AES-ECB block en(de)cryption
26432b31808SJens Wiklander  */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    unsigned char *round_keys = (unsigned char *) (ctx->buf + ctx->rk_offset);
    uint8x16_t state = vld1q_u8(input);

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    if (mode == MBEDTLS_AES_DECRYPT) {
        state = aesce_decrypt_block(state, round_keys, ctx->nr);
    } else {
        state = aesce_encrypt_block(state, round_keys, ctx->nr);
    }
#else
    /* Decryption support compiled out: mode is ignored and the block is
     * always encrypted. */
    (void) mode;
    state = aesce_encrypt_block(state, round_keys, ctx->nr);
#endif
    vst1q_u8(output, state);

    return 0;
}
28732b31808SJens Wiklander 
28832b31808SJens Wiklander /*
28932b31808SJens Wiklander  * Compute decryption round keys from encryption round keys
29032b31808SJens Wiklander  */
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int dst;
    int src = nr;

    /* The last encryption round key becomes the first decryption round
     * key, unchanged. */
    vst1q_u8(invkey, vld1q_u8(fwdkey + src * 16));
    /* Middle round keys are copied in reverse order with the inverse
     * MixColumns applied (see the decryption round ordering above). */
    for (dst = 1, src--; src > 0; dst++, src--) {
        vst1q_u8(invkey + dst * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + src * 16)));
    }
    /* The first encryption round key becomes the last decryption round
     * key, unchanged. */
    vst1q_u8(invkey + dst * 16, vld1q_u8(fwdkey + src * 16));
}
#endif
30732b31808SJens Wiklander 
/* AES RotWord: rotate a 32-bit word right by one byte (equivalent to the
 * FIPS-197 cyclic byte rotation on a little-endian word). */
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word >> 8) | (word << 24);
}
31232b31808SJens Wiklander 
/* AES SubWord: apply the S-box to each byte of a 32-bit word via AESE. */
static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t zeros = vdupq_n_u8(0);
    uint8x16_t vec = vreinterpretq_u8_u32(vdupq_n_u32(in));

    /* vaeseq_u8 performs both SubBytes and ShiftRows; reading only the
     * first 32-bit lane gives the correct result because ShiftRows leaves
     * the first row untouched. */
    vec = vaeseq_u8(zeros, vec);
    return vgetq_lane_u32(vreinterpretq_u32_u8(vec), 0);
}
32332b31808SJens Wiklander 
32432b31808SJens Wiklander /*
32532b31808SJens Wiklander  * Key expansion function
32632b31808SJens Wiklander  */
/*
 * Key expansion function (encryption direction).
 *
 * See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
 *   - Section 5, Nr = Nk + 6
 *   - Section 5.2, the length of round keys is Nb*(Nr+1)
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    const size_t nk = key_bit_length / 32;              /* Nk: key words */
    const size_t nb = 4;                                /* Nb: round key words */
    const size_t nr = nk + 6;                           /* Nr: rounds */
    const size_t schedule_words = nb * (nr + 1);        /* Nb*(Nr+1) */
    const uint32_t *schedule_end = (uint32_t *) rk + schedule_words;

    /* Round key 0 is the cipher key itself. */
    memcpy(rk, key, nk * 4);

    for (uint32_t *prev = (uint32_t *) rk;
         prev + nk < schedule_end;
         prev += nk) {

        size_t iteration = (size_t) (prev - (uint32_t *) rk) / nk;
        uint32_t *next = prev + nk;

        next[0] = aes_rot_word(aes_sub_word(prev[nk - 1]));
        next[0] ^= rcon[iteration] ^ prev[0];
        next[1] = next[0] ^ prev[1];
        next[2] = next[1] ^ prev[2];
        next[3] = next[2] ^ prev[3];
        if (next + nk > schedule_end) {
            /* Do not write words past the end of the schedule. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                next[4] = next[3] ^ prev[4];
                next[5] = next[4] ^ prev[5];
                break;
            case 256:
                /* AES-256 applies SubWord every Nk/2 words as well. */
                next[4] = aes_sub_word(next[3]) ^ prev[4];
                next[5] = next[4] ^ prev[5];
                next[6] = next[5] ^ prev[6];
                next[7] = next[6] ^ prev[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}
38032b31808SJens Wiklander 
38132b31808SJens Wiklander /*
38232b31808SJens Wiklander  * Key expansion, wrapper
38332b31808SJens Wiklander  */
mbedtls_aesce_setkey_enc(unsigned char * rk,const unsigned char * key,size_t bits)38432b31808SJens Wiklander int mbedtls_aesce_setkey_enc(unsigned char *rk,
38532b31808SJens Wiklander                              const unsigned char *key,
38632b31808SJens Wiklander                              size_t bits)
38732b31808SJens Wiklander {
38832b31808SJens Wiklander     switch (bits) {
38932b31808SJens Wiklander         case 128:
39032b31808SJens Wiklander         case 192:
39132b31808SJens Wiklander         case 256:
39232b31808SJens Wiklander             aesce_setkey_enc(rk, key, bits);
39332b31808SJens Wiklander             break;
39432b31808SJens Wiklander         default:
39532b31808SJens Wiklander             return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
39632b31808SJens Wiklander     }
39732b31808SJens Wiklander 
39832b31808SJens Wiklander     return 0;
39932b31808SJens Wiklander }
40032b31808SJens Wiklander 
40132b31808SJens Wiklander #if defined(MBEDTLS_GCM_C)
40232b31808SJens Wiklander 
403*b0563631STom Van Eyck #if defined(MBEDTLS_ARCH_IS_ARM32)
404*b0563631STom Van Eyck 
405*b0563631STom Van Eyck #if defined(__clang__)
406*b0563631STom Van Eyck /* On clang for A32/T32, work around some missing intrinsics and types which are listed in
407*b0563631STom Van Eyck  * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
408*b0563631STom Van Eyck  * These are only required for GCM.
409*b0563631STom Van Eyck  */
410*b0563631STom Van Eyck #define vreinterpretq_u64_p64(a) ((uint64x2_t) a)
411*b0563631STom Van Eyck 
412*b0563631STom Van Eyck typedef uint8x16_t poly128_t;
413*b0563631STom Van Eyck 
/* Clang for A32/T32 lacks the vmull_p64 intrinsic: emit the VMULL.P64
 * instruction directly. Returns the 128-bit carry-less (polynomial)
 * product of a and b. */
static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}
420*b0563631STom Van Eyck 
421*b0563631STom Van Eyck /* This is set to cause some more missing intrinsics to be defined below */
422*b0563631STom Van Eyck #define COMMON_MISSING_INTRINSICS
423*b0563631STom Van Eyck 
/* Carry-less multiply of the high 64-bit lanes of a and b, in terms of
 * the single-lane vmull_p64 above. */
static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    poly64_t hi_a = (poly64_t) vget_high_u64((uint64x2_t) a);
    poly64_t hi_b = (poly64_t) vget_high_u64((uint64x2_t) b);

    return vmull_p64(hi_a, hi_b);
}
429*b0563631STom Van Eyck 
430*b0563631STom Van Eyck #endif /* defined(__clang__) */
431*b0563631STom Van Eyck 
static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev.
     * The vector is spilled to memory via &x and processed as four 32-bit
     * words in r2-r5: rbit reverses all 32 bits of a word and rev then
     * reverses its byte order, so together they reverse the bits within
     * each byte while preserving byte order. The result is stored back
     * and returned by value. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2          \n\t"
         "rev  r2, r2          \n\t"
         "rbit r3, r3          \n\t"
         "rev  r3, r3          \n\t"
         "rbit r4, r4          \n\t"
         "rev  r4, r4          \n\t"
         "rbit r5, r5          \n\t"
         "rev  r5, r5          \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}
457*b0563631STom Van Eyck 
458*b0563631STom Van Eyck #endif /* defined(MBEDTLS_ARCH_IS_ARM32) */
459*b0563631STom Van Eyck 
460*b0563631STom Van Eyck #if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
46132b31808SJens Wiklander /* Some intrinsics are not available for GCC 5.X. */
462*b0563631STom Van Eyck #define COMMON_MISSING_INTRINSICS
463*b0563631STom Van Eyck #endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */
464*b0563631STom Van Eyck 
465*b0563631STom Van Eyck 
466*b0563631STom Van Eyck #if defined(COMMON_MISSING_INTRINSICS)
467*b0563631STom Van Eyck 
468*b0563631STom Van Eyck /* Missing intrinsics common to both GCC 5, and Clang on 32-bit */
469*b0563631STom Van Eyck 
47032b31808SJens Wiklander #define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
47132b31808SJens Wiklander #define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
472*b0563631STom Van Eyck 
/* Extract the low 64-bit lane of a poly64x2_t (missing intrinsic on some
 * older compilers; implemented via the u64 equivalent). */
static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    return (poly64x1_t) vget_low_u64(vreinterpretq_u64_p64(a));
}
479*b0563631STom Van Eyck 
480*b0563631STom Van Eyck #endif /* COMMON_MISSING_INTRINSICS */
48132b31808SJens Wiklander 
48232b31808SJens Wiklander /* vmull_p64/vmull_high_p64 wrappers.
48332b31808SJens Wiklander  *
48432b31808SJens Wiklander  * Older compilers miss some intrinsic functions for `poly*_t`. We use
48532b31808SJens Wiklander  * uint8x16_t and uint8x16x3_t as input/output parameters.
48632b31808SJens Wiklander  */
487*b0563631STom Van Eyck #if defined(MBEDTLS_COMPILER_IS_GCC)
488*b0563631STom Van Eyck /* GCC reports incompatible type error without cast. GCC think poly64_t and
489*b0563631STom Van Eyck  * poly64x1_t are different, that is different with MSVC and Clang. */
490*b0563631STom Van Eyck #define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
491*b0563631STom Van Eyck #else
492*b0563631STom Van Eyck /* MSVC reports `error C2440: 'type cast'` with cast. Clang does not report
493*b0563631STom Van Eyck  * error with/without cast. And I think poly64_t and poly64x1_t are same, no
494*b0563631STom Van Eyck  * cast for clang also. */
495*b0563631STom Van Eyck #define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
496*b0563631STom Van Eyck #endif /* MBEDTLS_COMPILER_IS_GCC */
497*b0563631STom Van Eyck 
/* Carry-less multiply of the low 64-bit halves of a and b, with
 * uint8x16_t in/out to sidestep missing poly*_t intrinsics. */
static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    poly64x1_t lo_a = vget_low_p64(vreinterpretq_p64_u8(a));
    poly64x1_t lo_b = vget_low_p64(vreinterpretq_p64_u8(b));

    return vreinterpretq_u8_p128(MBEDTLS_VMULL_P64(lo_a, lo_b));
}
50732b31808SJens Wiklander 
/* Carry-less multiply of the high 64-bit halves of a and b, with
 * uint8x16_t in/out to sidestep missing poly*_t intrinsics. */
static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    poly64x2_t pa = vreinterpretq_p64_u8(a);
    poly64x2_t pb = vreinterpretq_p64_u8(b);

    return vreinterpretq_u8_p128(vmull_high_p64(pa, pb));
}
51432b31808SJens Wiklander 
51532b31808SJens Wiklander /* GHASH does 128b polynomial multiplication on block in GF(2^128) defined by
51632b31808SJens Wiklander  * `x^128 + x^7 + x^2 + x + 1`.
51732b31808SJens Wiklander  *
51832b31808SJens Wiklander  * Arm64 only has 64b->128b polynomial multipliers, we need to do 4 64b
51932b31808SJens Wiklander  * multiplies to generate a 128b.
52032b31808SJens Wiklander  *
52132b31808SJens Wiklander  * `poly_mult_128` executes polynomial multiplication and outputs 256b that
52232b31808SJens Wiklander  * represented by 3 128b due to code size optimization.
52332b31808SJens Wiklander  *
52432b31808SJens Wiklander  * Output layout:
52532b31808SJens Wiklander  * |            |             |             |
52632b31808SJens Wiklander  * |------------|-------------|-------------|
52732b31808SJens Wiklander  * | ret.val[0] | h3:h2:00:00 | high   128b |
52832b31808SJens Wiklander  * | ret.val[1] |   :m2:m1:00 | middle 128b |
52932b31808SJens Wiklander  * | ret.val[2] |   :  :l1:l0 | low    128b |
53032b31808SJens Wiklander  */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t out;
    /* Swap the two 64-bit halves of b so the cross products line up. */
    uint8x16_t b_swapped = vextq_u8(b, b, 8);   /*      :c1:c0 = b0:b1 */
    uint8x16_t hi  = pmull_high(a, b);          /* h3:h2:00:00 = a1*b1 */
    uint8x16_t lo  = pmull_low(a, b);           /*   :  :l1:l0 = a0*b0 */
    uint8x16_t mid = veorq_u8(
        pmull_high(a, b_swapped),               /*   :d2:d1:00 = a1*b0 */
        pmull_low(a, b_swapped));               /*   :e2:e1:00 = a0*b1 */

    out.val[0] = hi;                            /* high   128b */
    out.val[1] = mid;                           /* middle 128b */
    out.val[2] = lo;                            /* low    128b */
    return out;
}
54932b31808SJens Wiklander 
55032b31808SJens Wiklander /*
55132b31808SJens Wiklander  * Modulo reduction.
55232b31808SJens Wiklander  *
55332b31808SJens Wiklander  * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
55432b31808SJens Wiklander  *
55532b31808SJens Wiklander  * Section 4.3
55632b31808SJens Wiklander  *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known
 * approach is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us
 * to write the 256-bit operand to be reduced as
 * a(z) = h(z) z^128 + l(z) ≡ h(z) r(z) + l(z). That is, we simply multiply
 * the higher part of the operand by r(z) and add it to l(z). If the result
 * is still larger than 128 bits, we reduce again.
56332b31808SJens Wiklander  */
/* Reduce the redundant 256b product from poly_mult_128 modulo the GHASH
 * polynomial x^128 + x^7 + x^2 + x + 1, yielding a single 128b value.
 * Each pmull against MODULO multiplies a high chunk by r(z) = 0x87, folding
 * it down into the lower bits (two folding rounds in total). */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    /* Every byte is 0x87; the shift below keeps only the lowest byte of
     * each 64-bit lane, i.e. the reduction constant r(z) = x^7+x^2+x+1. */
    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. It is for GNUC compatible compilers.
     */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];            /* h3:h2:00:00                          */
    m = input.val[1];            /*   :m2:m1:00                          */
    l = input.val[2];            /*   :  :l1:l0                          */
    c = pmull_high(h, MODULO);   /*   :c2:c1:00 = reduction of h3        */
    d = pmull_low(h, MODULO);    /*   :  :d1:d0 = reduction of h2        */
    e = veorq_u8(c, m);          /*   :e2:e1:00 = m2:m1:00 + c2:c1:00    */
    f = pmull_high(e, MODULO);   /*   :  :f1:f0 = reduction of e2        */
    g = vextq_u8(ZERO, e, 8);    /*   :  :g1:00 = e1:00                  */
    n = veorq_u8(d, l);          /*   :  :n1:n0 = d1:d0 + l1:l0          */
    o = veorq_u8(n, f);          /*       o1:o0 = f1:f0 + n1:n0          */
    return veorq_u8(o, g);       /*             = o1:o0 + g1:00          */
}
59032b31808SJens Wiklander 
59132b31808SJens Wiklander /*
59232b31808SJens Wiklander  * GCM multiplication: c = a times b in GF(2^128)
59332b31808SJens Wiklander  */
/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    /* GCM's bit order within each byte is the reverse of the pmull
     * instructions' convention, so bit-reflect the operands, multiply and
     * reduce, then reflect the result back before storing it. */
    uint8x16_t va = vrbitq_u8(vld1q_u8(a));
    uint8x16_t vb = vrbitq_u8(vld1q_u8(b));
    uint8x16x3_t product = poly_mult_128(va, vb);
    uint8x16_t vc = vrbitq_u8(poly_mult_reduce(product));

    vst1q_u8(c, vc);
}
60432b31808SJens Wiklander 
60532b31808SJens Wiklander #endif /* MBEDTLS_GCM_C */
60632b31808SJens Wiklander 
60732b31808SJens Wiklander #if defined(MBEDTLS_POP_TARGET_PRAGMA)
60832b31808SJens Wiklander #if defined(__clang__)
60932b31808SJens Wiklander #pragma clang attribute pop
61032b31808SJens Wiklander #elif defined(__GNUC__)
61132b31808SJens Wiklander #pragma GCC pop_options
61232b31808SJens Wiklander #endif
61332b31808SJens Wiklander #undef MBEDTLS_POP_TARGET_PRAGMA
61432b31808SJens Wiklander #endif
61532b31808SJens Wiklander 
616*b0563631STom Van Eyck #endif /* MBEDTLS_AESCE_HAVE_CODE */
61732b31808SJens Wiklander 
61832b31808SJens Wiklander #endif /* MBEDTLS_AESCE_C */
619