/*
 *  Armv8-A Cryptographic Extension support functions for Aarch64
 *
 *  Copyright The Mbed TLS Contributors
 *  SPDX-License-Identifier: Apache-2.0
 *
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may
 *  not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#if defined(__aarch64__) && !defined(__ARM_FEATURE_CRYPTO) && \
    defined(__clang__) && __clang_major__ >= 4
/* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations
 * without requiring -march on the command line.
 *
 * `arm_neon.h` could be included by any header file, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif
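/* MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG records that the feature
 * macros above were force-defined by this file rather than enabled via -march,
 * so the target pragma/attribute further down is still pushed to let the
 * compiler actually emit the AES/PMULL instructions. */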
#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_HAVE_ARM64)

#if !defined(__ARM_FEATURE_AES) || defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__clang__)
#       if __clang_major__ < 4
#           error "A more recent Clang is required for MBEDTLS_AESCE_C"
#       endif
#       pragma clang attribute push (__attribute__((target("crypto"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       if __GNUC__ < 6
#           error "A more recent GCC is required for MBEDTLS_AESCE_C"
#       endif
#       pragma GCC push_options
#       pragma GCC target ("arch=armv8-a+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   else
#       error "Only GCC and Clang supported for MBEDTLS_AESCE_C"
#   endif
#endif /* !__ARM_FEATURE_AES || MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#include <arm_neon.h>

#if defined(__linux__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif

/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support(void)
{
#if defined(__linux__)
    unsigned long auxval = getauxval(AT_HWCAP);
    return (auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
           (HWCAP_ASIMD | HWCAP_AES);
#else
    /* Assume AES instructions are supported. */
    return 1;
#endif
}
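/* Caller-side sketch (hypothetical; callers live outside this file): the
 * generic AES layer is expected to probe this once at runtime and dispatch
 * accordingly, along the lines of
 *
 *     if (mbedtls_aesce_has_support()) {
 *         return mbedtls_aesce_crypt_ecb(ctx, mode, input, output);
 *     }
 *     // otherwise fall back to the portable C implementation
 */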
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    for (int i = 0; i < rounds - 1; i++) {
        /* AES AddRoundKey, SubBytes, ShiftRows (in this order).
         * AddRoundKey adds the round key for the previous round. */
        block = vaeseq_u8(block, vld1q_u8(keys + i * 16));
        /* AES MixColumns */
        block = vaesmcq_u8(block);
    }

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));

    return block;
}
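/* Note on the loop structure above: the textbook cipher is an initial
 * AddRoundKey, then Nr-1 rounds of SubBytes/ShiftRows/MixColumns/AddRoundKey,
 * and a final round without MixColumns. AESE fuses AddRoundKey with the
 * following SubBytes and ShiftRows, so the code performs the same sequence of
 * operations with each round-key addition folded into the next round's AESE;
 * the two statements after the loop provide the final round and the last
 * AddRoundKey. */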
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    for (int i = 0; i < rounds - 1; i++) {
        /* AES AddRoundKey, inverse SubBytes, inverse ShiftRows */
        block = vaesdq_u8(block, vld1q_u8(keys + i * 16));
        /* AES inverse MixColumns for the next round.
         *
         * This means that we switch the order of the inverse AddRoundKey and
         * inverse MixColumns operations. We have to do this as AddRoundKey is
         * done in an atomic instruction together with the inverses of SubBytes
         * and ShiftRows.
         *
         * It works because MixColumns is a linear operation over GF(2^8) and
         * AddRoundKey is an exclusive or, which is equivalent to addition over
         * GF(2^8). (The inverse of MixColumns needs to be applied to the
         * affected round keys separately, which has been done when the
         * decryption round keys were calculated.) */
        block = vaesimcq_u8(block);
    }

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));

    return block;
}

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

    if (mode == MBEDTLS_AES_ENCRYPT) {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    } else {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}
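/* Usage sketch (hypothetical; not part of this file): once a key schedule has
 * been set up, e.g. with mbedtls_aes_setkey_enc(), and
 * mbedtls_aesce_has_support() has returned non-zero, a single 16-byte block
 * can be processed with:
 *
 *     unsigned char out[16];
 *     mbedtls_aesce_crypt_ecb(&ctx, MBEDTLS_AES_ENCRYPT, in, out);
 */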
/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}

static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}
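/* Note for the key expansion below: FIPS-197 computes the first word of each
 * new round key as SubWord(RotWord(w[i-1])) ^ Rcon[i/Nk] ^ w[i-Nk]. Because
 * SubWord substitutes each byte independently, it commutes with the byte
 * rotation, so aes_rot_word(aes_sub_word(x)) is equivalent. For AES-128 the
 * first expansion step is, for example:
 *
 *     w[4] = SubWord(RotWord(w[3])) ^ Rcon[1] ^ w[0]
 */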
/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
static inline poly64_t vget_low_p64(poly64x2_t __a)
{
    uint64x2_t tmp = (uint64x2_t) (__a);
    uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0));
    return (poly64_t) (lo);
}
#endif /* !__clang__ && __GNUC__ && __GNUC__ == 5 */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_p64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH multiplies 128-bit blocks in the GF(2^128) field defined by
 * `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only provides 64x64 -> 128-bit polynomial multipliers (PMULL/PMULL2),
 * so four 64-bit multiplies are needed for the full 128x128-bit product.
 *
 * `poly_mult_128` performs the polynomial multiplication and returns the
 * 256-bit result represented by three 128-bit vectors (a code-size
 * optimization).
 *
 * Output layout:
 * | vector     | layout      | part        |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high 128b   |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low 128b    |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8); /*   :  :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}
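/* A note on the decomposition used by poly_mult_128() above: writing the
 * operands as 64-bit halves a = a1:a0 and b = b1:b0, the carry-less product
 * expands to
 *
 *     a(z)*b(z) = a1*b1 * z^128  +  (a1*b0 + a0*b1) * z^64  +  a0*b0
 *
 * with + denoting XOR (addition in GF(2)). These are exactly the four
 * PMULL/PMULL2 products computed above; the two cross terms are XORed into
 * the middle 128 bits. */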
/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach
 * is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the
 * 256-bit operand to be reduced as a(z) = h(z)*z^128 + l(z) ≡ h(z)*r(z) + l(z).
 * That is, we simply multiply the higher part of the operand by r(z) and add it
 * to l(z). If the result is still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);
    /* use 'asm' as an optimisation barrier to prevent loading MODULO from memory */
    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
    asm ("" : "+w" (r));
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00                       */
    m = input.val[1];          /*   :m2:m1:00                       */
    l = input.val[2];          /*   :  :l1:l0                       */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3     */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2     */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2     */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00               */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0       */
    o = veorq_u8(n, f);        /*        o1:o0 = f1:f0 + n1:n0      */
    return veorq_u8(o, g);     /*              = o1:o0 + g1:00      */
}
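/* GHASH represents field elements with a reflected bit order (the most
 * significant bit of the first byte is the coefficient of x^0), whereas PMULL
 * treats bit 0 as the lowest-degree coefficient. The function below therefore
 * bit-reverses the operands with vrbitq_u8() before the carry-less multiply
 * and reverses the result back before storing it. */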
/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_HAVE_ARM64 */

#endif /* MBEDTLS_AESCE_C */