1817466cbSJens Wiklander /* 2817466cbSJens Wiklander * AES-NI support functions 3817466cbSJens Wiklander * 47901324dSJerome Forissier * Copyright The Mbed TLS Contributors 5b0563631STom Van Eyck * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 6817466cbSJens Wiklander */ 7817466cbSJens Wiklander 8817466cbSJens Wiklander /* 932b31808SJens Wiklander * [AES-WP] https://www.intel.com/content/www/us/en/developer/articles/tool/intel-advanced-encryption-standard-aes-instructions-set.html 1032b31808SJens Wiklander * [CLMUL-WP] https://www.intel.com/content/www/us/en/develop/download/intel-carry-less-multiplication-instruction-and-its-usage-for-computing-the-gcm-mode.html 11817466cbSJens Wiklander */ 12817466cbSJens Wiklander 137901324dSJerome Forissier #include "common.h" 14817466cbSJens Wiklander 15817466cbSJens Wiklander #if defined(MBEDTLS_AESNI_C) 16817466cbSJens Wiklander 1732b31808SJens Wiklander #include "aesni.h" 18817466cbSJens Wiklander 19817466cbSJens Wiklander #include <string.h> 20817466cbSJens Wiklander 2132b31808SJens Wiklander #if defined(MBEDTLS_AESNI_HAVE_CODE) 22817466cbSJens Wiklander 2332b31808SJens Wiklander #if MBEDTLS_AESNI_HAVE_CODE == 2 24b0563631STom Van Eyck #if defined(__GNUC__) 2532b31808SJens Wiklander #include <cpuid.h> 26b0563631STom Van Eyck #elif defined(_MSC_VER) 27b0563631STom Van Eyck #include <intrin.h> 28b0563631STom Van Eyck #else 29b0563631STom Van Eyck #error "`__cpuid` required by MBEDTLS_AESNI_C is not supported by the compiler" 3032b31808SJens Wiklander #endif 3132b31808SJens Wiklander #include <immintrin.h> 3232b31808SJens Wiklander #endif 33817466cbSJens Wiklander 34b0563631STom Van Eyck #if defined(MBEDTLS_ARCH_IS_X86) 35b0563631STom Van Eyck #if defined(MBEDTLS_COMPILER_IS_GCC) 36b0563631STom Van Eyck #pragma GCC push_options 37b0563631STom Van Eyck #pragma GCC target ("pclmul,sse2,aes") 38b0563631STom Van Eyck #define MBEDTLS_POP_TARGET_PRAGMA 39b0563631STom Van Eyck #elif defined(__clang__) && (__clang_major__ >= 5) 40b0563631STom Van Eyck #pragma clang attribute push (__attribute__((target("pclmul,sse2,aes"))), apply_to=function) 41b0563631STom Van Eyck #define MBEDTLS_POP_TARGET_PRAGMA 42b0563631STom Van Eyck #endif 43b0563631STom Van Eyck #endif 44b0563631STom Van Eyck 45b0563631STom Van Eyck #if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) 46817466cbSJens Wiklander /* 47817466cbSJens Wiklander * AES-NI support detection routine 48817466cbSJens Wiklander */ 49817466cbSJens Wiklander int mbedtls_aesni_has_support(unsigned int what) 50817466cbSJens Wiklander { 51817466cbSJens Wiklander static int done = 0; 52817466cbSJens Wiklander static unsigned int c = 0; 53817466cbSJens Wiklander 5432b31808SJens Wiklander if (!done) { 5532b31808SJens Wiklander #if MBEDTLS_AESNI_HAVE_CODE == 2 56b0563631STom Van Eyck static int info[4] = { 0, 0, 0, 0 }; 5732b31808SJens Wiklander #if defined(_MSC_VER) 5832b31808SJens Wiklander __cpuid(info, 1); 5932b31808SJens Wiklander #else 6032b31808SJens Wiklander __cpuid(1, info[0], info[1], info[2], info[3]); 6132b31808SJens Wiklander #endif 6232b31808SJens Wiklander c = info[2]; 6332b31808SJens Wiklander #else /* AESNI using asm */ 64817466cbSJens Wiklander asm ("movl $1, %%eax \n\t" 65817466cbSJens Wiklander "cpuid \n\t" 66817466cbSJens Wiklander : "=c" (c) 67817466cbSJens Wiklander : 68817466cbSJens Wiklander : "eax", "ebx", "edx"); 6932b31808SJens Wiklander #endif /* MBEDTLS_AESNI_HAVE_CODE */ 70817466cbSJens Wiklander done = 1; 71817466cbSJens Wiklander } 72817466cbSJens Wiklander 7332b31808SJens Wiklander return (c & what) != 0; 74817466cbSJens Wiklander } 75b0563631STom Van Eyck #endif /* !MBEDTLS_AES_USE_HARDWARE_ONLY */ 76817466cbSJens Wiklander 7732b31808SJens Wiklander #if MBEDTLS_AESNI_HAVE_CODE == 2 7832b31808SJens Wiklander 7932b31808SJens Wiklander /* 8032b31808SJens Wiklander * AES-NI AES-ECB block en(de)cryption 8132b31808SJens Wiklander */ 8232b31808SJens Wiklander int mbedtls_aesni_crypt_ecb(mbedtls_aes_context *ctx, 8332b31808SJens Wiklander int mode, 8432b31808SJens Wiklander const unsigned char input[16], 8532b31808SJens Wiklander unsigned char output[16]) 8632b31808SJens Wiklander { 8732b31808SJens Wiklander const __m128i *rk = (const __m128i *) (ctx->buf + ctx->rk_offset); 8832b31808SJens Wiklander unsigned nr = ctx->nr; // Number of remaining rounds 8932b31808SJens Wiklander 9032b31808SJens Wiklander // Load round key 0 9132b31808SJens Wiklander __m128i state; 9232b31808SJens Wiklander memcpy(&state, input, 16); 9332b31808SJens Wiklander state = _mm_xor_si128(state, rk[0]); // state ^= *rk; 9432b31808SJens Wiklander ++rk; 9532b31808SJens Wiklander --nr; 9632b31808SJens Wiklander 97b0563631STom Van Eyck #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT) 98b0563631STom Van Eyck if (mode == MBEDTLS_AES_DECRYPT) { 9932b31808SJens Wiklander while (nr != 0) { 10032b31808SJens Wiklander state = _mm_aesdec_si128(state, *rk); 10132b31808SJens Wiklander ++rk; 10232b31808SJens Wiklander --nr; 10332b31808SJens Wiklander } 10432b31808SJens Wiklander state = _mm_aesdeclast_si128(state, *rk); 105b0563631STom Van Eyck } else 106b0563631STom Van Eyck #else 107b0563631STom Van Eyck (void) mode; 108b0563631STom Van Eyck #endif 109b0563631STom Van Eyck { 11032b31808SJens Wiklander while (nr != 0) { 11132b31808SJens Wiklander state = _mm_aesenc_si128(state, *rk); 11232b31808SJens Wiklander ++rk; 11332b31808SJens Wiklander --nr; 11432b31808SJens Wiklander } 11532b31808SJens Wiklander state = _mm_aesenclast_si128(state, *rk); 11632b31808SJens Wiklander } 11732b31808SJens Wiklander 11832b31808SJens Wiklander memcpy(output, &state, 16); 11932b31808SJens Wiklander return 0; 12032b31808SJens Wiklander } 12132b31808SJens Wiklander 12232b31808SJens Wiklander /* 12332b31808SJens Wiklander * GCM multiplication: c = a times b in GF(2^128) 12432b31808SJens Wiklander * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5. 12532b31808SJens Wiklander */ 12632b31808SJens Wiklander 12732b31808SJens Wiklander static void gcm_clmul(const __m128i aa, const __m128i bb, 12832b31808SJens Wiklander __m128i *cc, __m128i *dd) 12932b31808SJens Wiklander { 13032b31808SJens Wiklander /* 13132b31808SJens Wiklander * Caryless multiplication dd:cc = aa * bb 13232b31808SJens Wiklander * using [CLMUL-WP] algorithm 1 (p. 12). 13332b31808SJens Wiklander */ 13432b31808SJens Wiklander *cc = _mm_clmulepi64_si128(aa, bb, 0x00); // a0*b0 = c1:c0 13532b31808SJens Wiklander *dd = _mm_clmulepi64_si128(aa, bb, 0x11); // a1*b1 = d1:d0 13632b31808SJens Wiklander __m128i ee = _mm_clmulepi64_si128(aa, bb, 0x10); // a0*b1 = e1:e0 13732b31808SJens Wiklander __m128i ff = _mm_clmulepi64_si128(aa, bb, 0x01); // a1*b0 = f1:f0 13832b31808SJens Wiklander ff = _mm_xor_si128(ff, ee); // e1+f1:e0+f0 13932b31808SJens Wiklander ee = ff; // e1+f1:e0+f0 14032b31808SJens Wiklander ff = _mm_srli_si128(ff, 8); // 0:e1+f1 14132b31808SJens Wiklander ee = _mm_slli_si128(ee, 8); // e0+f0:0 14232b31808SJens Wiklander *dd = _mm_xor_si128(*dd, ff); // d1:d0+e1+f1 14332b31808SJens Wiklander *cc = _mm_xor_si128(*cc, ee); // c1+e0+f0:c0 14432b31808SJens Wiklander } 14532b31808SJens Wiklander 14632b31808SJens Wiklander static void gcm_shift(__m128i *cc, __m128i *dd) 14732b31808SJens Wiklander { 14832b31808SJens Wiklander /* [CMUCL-WP] Algorithm 5 Step 1: shift cc:dd one bit to the left, 14932b31808SJens Wiklander * taking advantage of [CLMUL-WP] eq 27 (p. 18). */ 15032b31808SJens Wiklander // // *cc = r1:r0 15132b31808SJens Wiklander // // *dd = r3:r2 15232b31808SJens Wiklander __m128i cc_lo = _mm_slli_epi64(*cc, 1); // r1<<1:r0<<1 15332b31808SJens Wiklander __m128i dd_lo = _mm_slli_epi64(*dd, 1); // r3<<1:r2<<1 15432b31808SJens Wiklander __m128i cc_hi = _mm_srli_epi64(*cc, 63); // r1>>63:r0>>63 15532b31808SJens Wiklander __m128i dd_hi = _mm_srli_epi64(*dd, 63); // r3>>63:r2>>63 15632b31808SJens Wiklander __m128i xmm5 = _mm_srli_si128(cc_hi, 8); // 0:r1>>63 15732b31808SJens Wiklander cc_hi = _mm_slli_si128(cc_hi, 8); // r0>>63:0 15832b31808SJens Wiklander dd_hi = _mm_slli_si128(dd_hi, 8); // 0:r1>>63 15932b31808SJens Wiklander 16032b31808SJens Wiklander *cc = _mm_or_si128(cc_lo, cc_hi); // r1<<1|r0>>63:r0<<1 16132b31808SJens Wiklander *dd = _mm_or_si128(_mm_or_si128(dd_lo, dd_hi), xmm5); // r3<<1|r2>>62:r2<<1|r1>>63 16232b31808SJens Wiklander } 16332b31808SJens Wiklander 16432b31808SJens Wiklander static __m128i gcm_reduce(__m128i xx) 16532b31808SJens Wiklander { 16632b31808SJens Wiklander // // xx = x1:x0 16732b31808SJens Wiklander /* [CLMUL-WP] Algorithm 5 Step 2 */ 16832b31808SJens Wiklander __m128i aa = _mm_slli_epi64(xx, 63); // x1<<63:x0<<63 = stuff:a 16932b31808SJens Wiklander __m128i bb = _mm_slli_epi64(xx, 62); // x1<<62:x0<<62 = stuff:b 17032b31808SJens Wiklander __m128i cc = _mm_slli_epi64(xx, 57); // x1<<57:x0<<57 = stuff:c 17132b31808SJens Wiklander __m128i dd = _mm_slli_si128(_mm_xor_si128(_mm_xor_si128(aa, bb), cc), 8); // a+b+c:0 17232b31808SJens Wiklander return _mm_xor_si128(dd, xx); // x1+a+b+c:x0 = d:x0 17332b31808SJens Wiklander } 17432b31808SJens Wiklander 17532b31808SJens Wiklander static __m128i gcm_mix(__m128i dx) 17632b31808SJens Wiklander { 17732b31808SJens Wiklander /* [CLMUL-WP] Algorithm 5 Steps 3 and 4 */ 17832b31808SJens Wiklander __m128i ee = _mm_srli_epi64(dx, 1); // e1:x0>>1 = e1:e0' 17932b31808SJens Wiklander __m128i ff = _mm_srli_epi64(dx, 2); // f1:x0>>2 = f1:f0' 18032b31808SJens Wiklander __m128i gg = _mm_srli_epi64(dx, 7); // g1:x0>>7 = g1:g0' 18132b31808SJens Wiklander 18232b31808SJens Wiklander // e0'+f0'+g0' is almost e0+f0+g0, except for some missing 18332b31808SJens Wiklander // bits carried from d. Now get those bits back in. 18432b31808SJens Wiklander __m128i eh = _mm_slli_epi64(dx, 63); // d<<63:stuff 18532b31808SJens Wiklander __m128i fh = _mm_slli_epi64(dx, 62); // d<<62:stuff 18632b31808SJens Wiklander __m128i gh = _mm_slli_epi64(dx, 57); // d<<57:stuff 18732b31808SJens Wiklander __m128i hh = _mm_srli_si128(_mm_xor_si128(_mm_xor_si128(eh, fh), gh), 8); // 0:missing bits of d 18832b31808SJens Wiklander 18932b31808SJens Wiklander return _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(ee, ff), gg), hh), dx); 19032b31808SJens Wiklander } 19132b31808SJens Wiklander 19232b31808SJens Wiklander void mbedtls_aesni_gcm_mult(unsigned char c[16], 19332b31808SJens Wiklander const unsigned char a[16], 19432b31808SJens Wiklander const unsigned char b[16]) 19532b31808SJens Wiklander { 196b0563631STom Van Eyck __m128i aa = { 0 }, bb = { 0 }, cc, dd; 19732b31808SJens Wiklander 19832b31808SJens Wiklander /* The inputs are in big-endian order, so byte-reverse them */ 19932b31808SJens Wiklander for (size_t i = 0; i < 16; i++) { 20032b31808SJens Wiklander ((uint8_t *) &aa)[i] = a[15 - i]; 20132b31808SJens Wiklander ((uint8_t *) &bb)[i] = b[15 - i]; 20232b31808SJens Wiklander } 20332b31808SJens Wiklander 20432b31808SJens Wiklander gcm_clmul(aa, bb, &cc, &dd); 20532b31808SJens Wiklander gcm_shift(&cc, &dd); 20632b31808SJens Wiklander /* 20732b31808SJens Wiklander * Now reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1 20832b31808SJens Wiklander * using [CLMUL-WP] algorithm 5 (p. 18). 20932b31808SJens Wiklander * Currently dd:cc holds x3:x2:x1:x0 (already shifted). 21032b31808SJens Wiklander */ 21132b31808SJens Wiklander __m128i dx = gcm_reduce(cc); 21232b31808SJens Wiklander __m128i xh = gcm_mix(dx); 21332b31808SJens Wiklander cc = _mm_xor_si128(xh, dd); // x3+h1:x2+h0 21432b31808SJens Wiklander 21532b31808SJens Wiklander /* Now byte-reverse the outputs */ 21632b31808SJens Wiklander for (size_t i = 0; i < 16; i++) { 21732b31808SJens Wiklander c[i] = ((uint8_t *) &cc)[15 - i]; 21832b31808SJens Wiklander } 21932b31808SJens Wiklander 22032b31808SJens Wiklander return; 22132b31808SJens Wiklander } 22232b31808SJens Wiklander 22332b31808SJens Wiklander /* 22432b31808SJens Wiklander * Compute decryption round keys from encryption round keys 22532b31808SJens Wiklander */ 226b0563631STom Van Eyck #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT) 22732b31808SJens Wiklander void mbedtls_aesni_inverse_key(unsigned char *invkey, 22832b31808SJens Wiklander const unsigned char *fwdkey, int nr) 22932b31808SJens Wiklander { 23032b31808SJens Wiklander __m128i *ik = (__m128i *) invkey; 23132b31808SJens Wiklander const __m128i *fk = (const __m128i *) fwdkey + nr; 23232b31808SJens Wiklander 23332b31808SJens Wiklander *ik = *fk; 23432b31808SJens Wiklander for (--fk, ++ik; fk > (const __m128i *) fwdkey; --fk, ++ik) { 23532b31808SJens Wiklander *ik = _mm_aesimc_si128(*fk); 23632b31808SJens Wiklander } 23732b31808SJens Wiklander *ik = *fk; 23832b31808SJens Wiklander } 239b0563631STom Van Eyck #endif 24032b31808SJens Wiklander 24132b31808SJens Wiklander /* 24232b31808SJens Wiklander * Key expansion, 128-bit case 24332b31808SJens Wiklander */ 24432b31808SJens Wiklander static __m128i aesni_set_rk_128(__m128i state, __m128i xword) 24532b31808SJens Wiklander { 24632b31808SJens Wiklander /* 24732b31808SJens Wiklander * Finish generating the next round key. 24832b31808SJens Wiklander * 24932b31808SJens Wiklander * On entry state is r3:r2:r1:r0 and xword is X:stuff:stuff:stuff 25032b31808SJens Wiklander * with X = rot( sub( r3 ) ) ^ RCON (obtained with AESKEYGENASSIST). 25132b31808SJens Wiklander * 25232b31808SJens Wiklander * On exit, xword is r7:r6:r5:r4 25332b31808SJens Wiklander * with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3 25432b31808SJens Wiklander * and this is returned, to be written to the round key buffer. 25532b31808SJens Wiklander */ 25632b31808SJens Wiklander xword = _mm_shuffle_epi32(xword, 0xff); // X:X:X:X 25732b31808SJens Wiklander xword = _mm_xor_si128(xword, state); // X+r3:X+r2:X+r1:r4 25832b31808SJens Wiklander state = _mm_slli_si128(state, 4); // r2:r1:r0:0 25932b31808SJens Wiklander xword = _mm_xor_si128(xword, state); // X+r3+r2:X+r2+r1:r5:r4 26032b31808SJens Wiklander state = _mm_slli_si128(state, 4); // r1:r0:0:0 26132b31808SJens Wiklander xword = _mm_xor_si128(xword, state); // X+r3+r2+r1:r6:r5:r4 26232b31808SJens Wiklander state = _mm_slli_si128(state, 4); // r0:0:0:0 26332b31808SJens Wiklander state = _mm_xor_si128(xword, state); // r7:r6:r5:r4 26432b31808SJens Wiklander return state; 26532b31808SJens Wiklander } 26632b31808SJens Wiklander 26732b31808SJens Wiklander static void aesni_setkey_enc_128(unsigned char *rk_bytes, 26832b31808SJens Wiklander const unsigned char *key) 26932b31808SJens Wiklander { 27032b31808SJens Wiklander __m128i *rk = (__m128i *) rk_bytes; 27132b31808SJens Wiklander 27232b31808SJens Wiklander memcpy(&rk[0], key, 16); 27332b31808SJens Wiklander rk[1] = aesni_set_rk_128(rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01)); 27432b31808SJens Wiklander rk[2] = aesni_set_rk_128(rk[1], _mm_aeskeygenassist_si128(rk[1], 0x02)); 27532b31808SJens Wiklander rk[3] = aesni_set_rk_128(rk[2], _mm_aeskeygenassist_si128(rk[2], 0x04)); 27632b31808SJens Wiklander rk[4] = aesni_set_rk_128(rk[3], _mm_aeskeygenassist_si128(rk[3], 0x08)); 27732b31808SJens Wiklander rk[5] = aesni_set_rk_128(rk[4], _mm_aeskeygenassist_si128(rk[4], 0x10)); 27832b31808SJens Wiklander rk[6] = aesni_set_rk_128(rk[5], _mm_aeskeygenassist_si128(rk[5], 0x20)); 27932b31808SJens Wiklander rk[7] = aesni_set_rk_128(rk[6], _mm_aeskeygenassist_si128(rk[6], 0x40)); 28032b31808SJens Wiklander rk[8] = aesni_set_rk_128(rk[7], _mm_aeskeygenassist_si128(rk[7], 0x80)); 28132b31808SJens Wiklander rk[9] = aesni_set_rk_128(rk[8], _mm_aeskeygenassist_si128(rk[8], 0x1B)); 28232b31808SJens Wiklander rk[10] = aesni_set_rk_128(rk[9], _mm_aeskeygenassist_si128(rk[9], 0x36)); 28332b31808SJens Wiklander } 28432b31808SJens Wiklander 28532b31808SJens Wiklander /* 28632b31808SJens Wiklander * Key expansion, 192-bit case 28732b31808SJens Wiklander */ 288b0563631STom Van Eyck #if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH) 28932b31808SJens Wiklander static void aesni_set_rk_192(__m128i *state0, __m128i *state1, __m128i xword, 29032b31808SJens Wiklander unsigned char *rk) 29132b31808SJens Wiklander { 29232b31808SJens Wiklander /* 29332b31808SJens Wiklander * Finish generating the next 6 quarter-keys. 29432b31808SJens Wiklander * 29532b31808SJens Wiklander * On entry state0 is r3:r2:r1:r0, state1 is stuff:stuff:r5:r4 29632b31808SJens Wiklander * and xword is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON 29732b31808SJens Wiklander * (obtained with AESKEYGENASSIST). 29832b31808SJens Wiklander * 29932b31808SJens Wiklander * On exit, state0 is r9:r8:r7:r6 and state1 is stuff:stuff:r11:r10 30032b31808SJens Wiklander * and those are written to the round key buffer. 30132b31808SJens Wiklander */ 30232b31808SJens Wiklander xword = _mm_shuffle_epi32(xword, 0x55); // X:X:X:X 30332b31808SJens Wiklander xword = _mm_xor_si128(xword, *state0); // X+r3:X+r2:X+r1:X+r0 30432b31808SJens Wiklander *state0 = _mm_slli_si128(*state0, 4); // r2:r1:r0:0 30532b31808SJens Wiklander xword = _mm_xor_si128(xword, *state0); // X+r3+r2:X+r2+r1:X+r1+r0:X+r0 30632b31808SJens Wiklander *state0 = _mm_slli_si128(*state0, 4); // r1:r0:0:0 30732b31808SJens Wiklander xword = _mm_xor_si128(xword, *state0); // X+r3+r2+r1:X+r2+r1+r0:X+r1+r0:X+r0 30832b31808SJens Wiklander *state0 = _mm_slli_si128(*state0, 4); // r0:0:0:0 30932b31808SJens Wiklander xword = _mm_xor_si128(xword, *state0); // X+r3+r2+r1+r0:X+r2+r1+r0:X+r1+r0:X+r0 31032b31808SJens Wiklander *state0 = xword; // = r9:r8:r7:r6 31132b31808SJens Wiklander 31232b31808SJens Wiklander xword = _mm_shuffle_epi32(xword, 0xff); // r9:r9:r9:r9 31332b31808SJens Wiklander xword = _mm_xor_si128(xword, *state1); // stuff:stuff:r9+r5:r9+r4 31432b31808SJens Wiklander *state1 = _mm_slli_si128(*state1, 4); // stuff:stuff:r4:0 31532b31808SJens Wiklander xword = _mm_xor_si128(xword, *state1); // stuff:stuff:r9+r5+r4:r9+r4 31632b31808SJens Wiklander *state1 = xword; // = stuff:stuff:r11:r10 31732b31808SJens Wiklander 31832b31808SJens Wiklander /* Store state0 and the low half of state1 into rk, which is conceptually 31932b31808SJens Wiklander * an array of 24-byte elements. Since 24 is not a multiple of 16, 32032b31808SJens Wiklander * rk is not necessarily aligned so just `*rk = *state0` doesn't work. */ 32132b31808SJens Wiklander memcpy(rk, state0, 16); 32232b31808SJens Wiklander memcpy(rk + 16, state1, 8); 32332b31808SJens Wiklander } 32432b31808SJens Wiklander 32532b31808SJens Wiklander static void aesni_setkey_enc_192(unsigned char *rk, 32632b31808SJens Wiklander const unsigned char *key) 32732b31808SJens Wiklander { 32832b31808SJens Wiklander /* First round: use original key */ 32932b31808SJens Wiklander memcpy(rk, key, 24); 33032b31808SJens Wiklander /* aes.c guarantees that rk is aligned on a 16-byte boundary. */ 33132b31808SJens Wiklander __m128i state0 = ((__m128i *) rk)[0]; 33232b31808SJens Wiklander __m128i state1 = _mm_loadl_epi64(((__m128i *) rk) + 1); 33332b31808SJens Wiklander 33432b31808SJens Wiklander aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x01), rk + 24 * 1); 33532b31808SJens Wiklander aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x02), rk + 24 * 2); 33632b31808SJens Wiklander aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x04), rk + 24 * 3); 33732b31808SJens Wiklander aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x08), rk + 24 * 4); 33832b31808SJens Wiklander aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x10), rk + 24 * 5); 33932b31808SJens Wiklander aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x20), rk + 24 * 6); 34032b31808SJens Wiklander aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x40), rk + 24 * 7); 34132b31808SJens Wiklander aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x80), rk + 24 * 8); 34232b31808SJens Wiklander } 343b0563631STom Van Eyck #endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */ 34432b31808SJens Wiklander 34532b31808SJens Wiklander /* 34632b31808SJens Wiklander * Key expansion, 256-bit case 34732b31808SJens Wiklander */ 348b0563631STom Van Eyck #if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH) 34932b31808SJens Wiklander static void aesni_set_rk_256(__m128i state0, __m128i state1, __m128i xword, 35032b31808SJens Wiklander __m128i *rk0, __m128i *rk1) 35132b31808SJens Wiklander { 35232b31808SJens Wiklander /* 35332b31808SJens Wiklander * Finish generating the next two round keys. 35432b31808SJens Wiklander * 35532b31808SJens Wiklander * On entry state0 is r3:r2:r1:r0, state1 is r7:r6:r5:r4 and 35632b31808SJens Wiklander * xword is X:stuff:stuff:stuff with X = rot( sub( r7 )) ^ RCON 35732b31808SJens Wiklander * (obtained with AESKEYGENASSIST). 35832b31808SJens Wiklander * 35932b31808SJens Wiklander * On exit, *rk0 is r11:r10:r9:r8 and *rk1 is r15:r14:r13:r12 36032b31808SJens Wiklander */ 36132b31808SJens Wiklander xword = _mm_shuffle_epi32(xword, 0xff); 36232b31808SJens Wiklander xword = _mm_xor_si128(xword, state0); 36332b31808SJens Wiklander state0 = _mm_slli_si128(state0, 4); 36432b31808SJens Wiklander xword = _mm_xor_si128(xword, state0); 36532b31808SJens Wiklander state0 = _mm_slli_si128(state0, 4); 36632b31808SJens Wiklander xword = _mm_xor_si128(xword, state0); 36732b31808SJens Wiklander state0 = _mm_slli_si128(state0, 4); 36832b31808SJens Wiklander state0 = _mm_xor_si128(state0, xword); 36932b31808SJens Wiklander *rk0 = state0; 37032b31808SJens Wiklander 37132b31808SJens Wiklander /* Set xword to stuff:Y:stuff:stuff with Y = subword( r11 ) 37232b31808SJens Wiklander * and proceed to generate next round key from there */ 37332b31808SJens Wiklander xword = _mm_aeskeygenassist_si128(state0, 0x00); 37432b31808SJens Wiklander xword = _mm_shuffle_epi32(xword, 0xaa); 37532b31808SJens Wiklander xword = _mm_xor_si128(xword, state1); 37632b31808SJens Wiklander state1 = _mm_slli_si128(state1, 4); 37732b31808SJens Wiklander xword = _mm_xor_si128(xword, state1); 37832b31808SJens Wiklander state1 = _mm_slli_si128(state1, 4); 37932b31808SJens Wiklander xword = _mm_xor_si128(xword, state1); 38032b31808SJens Wiklander state1 = _mm_slli_si128(state1, 4); 38132b31808SJens Wiklander state1 = _mm_xor_si128(state1, xword); 38232b31808SJens Wiklander *rk1 = state1; 38332b31808SJens Wiklander } 38432b31808SJens Wiklander 38532b31808SJens Wiklander static void aesni_setkey_enc_256(unsigned char *rk_bytes, 38632b31808SJens Wiklander const unsigned char *key) 38732b31808SJens Wiklander { 38832b31808SJens Wiklander __m128i *rk = (__m128i *) rk_bytes; 38932b31808SJens Wiklander 39032b31808SJens Wiklander memcpy(&rk[0], key, 16); 39132b31808SJens Wiklander memcpy(&rk[1], key + 16, 16); 39232b31808SJens Wiklander 39332b31808SJens Wiklander /* 39432b31808SJens Wiklander * Main "loop" - Generating one more key than necessary, 39532b31808SJens Wiklander * see definition of mbedtls_aes_context.buf 39632b31808SJens Wiklander */ 39732b31808SJens Wiklander aesni_set_rk_256(rk[0], rk[1], _mm_aeskeygenassist_si128(rk[1], 0x01), &rk[2], &rk[3]); 39832b31808SJens Wiklander aesni_set_rk_256(rk[2], rk[3], _mm_aeskeygenassist_si128(rk[3], 0x02), &rk[4], &rk[5]); 39932b31808SJens Wiklander aesni_set_rk_256(rk[4], rk[5], _mm_aeskeygenassist_si128(rk[5], 0x04), &rk[6], &rk[7]); 40032b31808SJens Wiklander aesni_set_rk_256(rk[6], rk[7], _mm_aeskeygenassist_si128(rk[7], 0x08), &rk[8], &rk[9]); 40132b31808SJens Wiklander aesni_set_rk_256(rk[8], rk[9], _mm_aeskeygenassist_si128(rk[9], 0x10), &rk[10], &rk[11]); 40232b31808SJens Wiklander aesni_set_rk_256(rk[10], rk[11], _mm_aeskeygenassist_si128(rk[11], 0x20), &rk[12], &rk[13]); 40332b31808SJens Wiklander aesni_set_rk_256(rk[12], rk[13], _mm_aeskeygenassist_si128(rk[13], 0x40), &rk[14], &rk[15]); 40432b31808SJens Wiklander } 405b0563631STom Van Eyck #endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */ 406b0563631STom Van Eyck 407b0563631STom Van Eyck #if defined(MBEDTLS_POP_TARGET_PRAGMA) 408b0563631STom Van Eyck #if defined(__clang__) 409b0563631STom Van Eyck #pragma clang attribute pop 410b0563631STom Van Eyck #elif defined(__GNUC__) 411b0563631STom Van Eyck #pragma GCC pop_options 412b0563631STom Van Eyck #endif 413b0563631STom Van Eyck #undef MBEDTLS_POP_TARGET_PRAGMA 414b0563631STom Van Eyck #endif 41532b31808SJens Wiklander 41632b31808SJens Wiklander #else /* MBEDTLS_AESNI_HAVE_CODE == 1 */ 41732b31808SJens Wiklander 41832b31808SJens Wiklander #if defined(__has_feature) 41932b31808SJens Wiklander #if __has_feature(memory_sanitizer) 42032b31808SJens Wiklander #warning \ 42132b31808SJens Wiklander "MBEDTLS_AESNI_C is known to cause spurious error reports with some memory sanitizers as they do not understand the assembly code." 42232b31808SJens Wiklander #endif 42332b31808SJens Wiklander #endif 42432b31808SJens Wiklander 425817466cbSJens Wiklander /* 426817466cbSJens Wiklander * Binutils needs to be at least 2.19 to support AES-NI instructions. 427817466cbSJens Wiklander * Unfortunately, a lot of users have a lower version now (2014-04). 428817466cbSJens Wiklander * Emit bytecode directly in order to support "old" version of gas. 429817466cbSJens Wiklander * 430817466cbSJens Wiklander * Opcodes from the Intel architecture reference manual, vol. 3. 431817466cbSJens Wiklander * We always use registers, so we don't need prefixes for memory operands. 432817466cbSJens Wiklander * Operand macros are in gas order (src, dst) as opposed to Intel order 433817466cbSJens Wiklander * (dst, src) in order to blend better into the surrounding assembly code. 434817466cbSJens Wiklander */ 43532b31808SJens Wiklander #define AESDEC(regs) ".byte 0x66,0x0F,0x38,0xDE," regs "\n\t" 43632b31808SJens Wiklander #define AESDECLAST(regs) ".byte 0x66,0x0F,0x38,0xDF," regs "\n\t" 43732b31808SJens Wiklander #define AESENC(regs) ".byte 0x66,0x0F,0x38,0xDC," regs "\n\t" 43832b31808SJens Wiklander #define AESENCLAST(regs) ".byte 0x66,0x0F,0x38,0xDD," regs "\n\t" 43932b31808SJens Wiklander #define AESIMC(regs) ".byte 0x66,0x0F,0x38,0xDB," regs "\n\t" 44032b31808SJens Wiklander #define AESKEYGENA(regs, imm) ".byte 0x66,0x0F,0x3A,0xDF," regs "," imm "\n\t" 44132b31808SJens Wiklander #define PCLMULQDQ(regs, imm) ".byte 0x66,0x0F,0x3A,0x44," regs "," imm "\n\t" 442817466cbSJens Wiklander 443817466cbSJens Wiklander #define xmm0_xmm0 "0xC0" 444817466cbSJens Wiklander #define xmm0_xmm1 "0xC8" 445817466cbSJens Wiklander #define xmm0_xmm2 "0xD0" 446817466cbSJens Wiklander #define xmm0_xmm3 "0xD8" 447817466cbSJens Wiklander #define xmm0_xmm4 "0xE0" 448817466cbSJens Wiklander #define xmm1_xmm0 "0xC1" 449817466cbSJens Wiklander #define xmm1_xmm2 "0xD1" 450817466cbSJens Wiklander 451817466cbSJens Wiklander /* 452817466cbSJens Wiklander * AES-NI AES-ECB block en(de)cryption 453817466cbSJens Wiklander */ 454817466cbSJens Wiklander int mbedtls_aesni_crypt_ecb(mbedtls_aes_context *ctx, 455817466cbSJens Wiklander int mode, 456817466cbSJens Wiklander const unsigned char input[16], 457817466cbSJens Wiklander unsigned char output[16]) 458817466cbSJens Wiklander { 459817466cbSJens Wiklander asm ("movdqu (%3), %%xmm0 \n\t" // load input 460817466cbSJens Wiklander "movdqu (%1), %%xmm1 \n\t" // load round key 0 461817466cbSJens Wiklander "pxor %%xmm1, %%xmm0 \n\t" // round 0 462817466cbSJens Wiklander "add $16, %1 \n\t" // point to next round key 463817466cbSJens Wiklander "subl $1, %0 \n\t" // normal rounds = nr - 1 464817466cbSJens Wiklander "test %2, %2 \n\t" // mode? 465817466cbSJens Wiklander "jz 2f \n\t" // 0 = decrypt 466817466cbSJens Wiklander 467817466cbSJens Wiklander "1: \n\t" // encryption loop 468817466cbSJens Wiklander "movdqu (%1), %%xmm1 \n\t" // load round key 46932b31808SJens Wiklander AESENC(xmm1_xmm0) // do round 470817466cbSJens Wiklander "add $16, %1 \n\t" // point to next round key 471817466cbSJens Wiklander "subl $1, %0 \n\t" // loop 472817466cbSJens Wiklander "jnz 1b \n\t" 473817466cbSJens Wiklander "movdqu (%1), %%xmm1 \n\t" // load round key 47432b31808SJens Wiklander AESENCLAST(xmm1_xmm0) // last round 475b0563631STom Van Eyck #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT) 476817466cbSJens Wiklander "jmp 3f \n\t" 477817466cbSJens Wiklander 478817466cbSJens Wiklander "2: \n\t" // decryption loop 479817466cbSJens Wiklander "movdqu (%1), %%xmm1 \n\t" 48032b31808SJens Wiklander AESDEC(xmm1_xmm0) // do round 481817466cbSJens Wiklander "add $16, %1 \n\t" 482817466cbSJens Wiklander "subl $1, %0 \n\t" 483817466cbSJens Wiklander "jnz 2b \n\t" 484817466cbSJens Wiklander "movdqu (%1), %%xmm1 \n\t" // load round key 48532b31808SJens Wiklander AESDECLAST(xmm1_xmm0) // last round 486b0563631STom Van Eyck #endif 487817466cbSJens Wiklander 488817466cbSJens Wiklander "3: \n\t" 489817466cbSJens Wiklander "movdqu %%xmm0, (%4) \n\t" // export output 490817466cbSJens Wiklander : 49132b31808SJens Wiklander : "r" (ctx->nr), "r" (ctx->buf + ctx->rk_offset), "r" (mode), "r" (input), "r" (output) 492*c3deb3d6SEtienne Carriere : "memory", "cc", "xmm0", "xmm1", "0", "1"); 493817466cbSJens Wiklander 494817466cbSJens Wiklander 49532b31808SJens Wiklander return 0; 496817466cbSJens Wiklander } 497817466cbSJens Wiklander 498817466cbSJens Wiklander /* 499817466cbSJens Wiklander * GCM multiplication: c = a times b in GF(2^128) 500817466cbSJens Wiklander * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5. 501817466cbSJens Wiklander */ 502817466cbSJens Wiklander void mbedtls_aesni_gcm_mult(unsigned char c[16], 503817466cbSJens Wiklander const unsigned char a[16], 504817466cbSJens Wiklander const unsigned char b[16]) 505817466cbSJens Wiklander { 506817466cbSJens Wiklander unsigned char aa[16], bb[16], cc[16]; 507817466cbSJens Wiklander size_t i; 508817466cbSJens Wiklander 509817466cbSJens Wiklander /* The inputs are in big-endian order, so byte-reverse them */ 51032b31808SJens Wiklander for (i = 0; i < 16; i++) { 511817466cbSJens Wiklander aa[i] = a[15 - i]; 512817466cbSJens Wiklander bb[i] = b[15 - i]; 513817466cbSJens Wiklander } 514817466cbSJens Wiklander 515817466cbSJens Wiklander asm ("movdqu (%0), %%xmm0 \n\t" // a1:a0 516817466cbSJens Wiklander "movdqu (%1), %%xmm1 \n\t" // b1:b0 517817466cbSJens Wiklander 518817466cbSJens Wiklander /* 519817466cbSJens Wiklander * Caryless multiplication xmm2:xmm1 = xmm0 * xmm1 52032b31808SJens Wiklander * using [CLMUL-WP] algorithm 1 (p. 12). 521817466cbSJens Wiklander */ 522817466cbSJens Wiklander "movdqa %%xmm1, %%xmm2 \n\t" // copy of b1:b0 523817466cbSJens Wiklander "movdqa %%xmm1, %%xmm3 \n\t" // same 524817466cbSJens Wiklander "movdqa %%xmm1, %%xmm4 \n\t" // same 52532b31808SJens Wiklander PCLMULQDQ(xmm0_xmm1, "0x00") // a0*b0 = c1:c0 52632b31808SJens Wiklander PCLMULQDQ(xmm0_xmm2, "0x11") // a1*b1 = d1:d0 52732b31808SJens Wiklander PCLMULQDQ(xmm0_xmm3, "0x10") // a0*b1 = e1:e0 52832b31808SJens Wiklander PCLMULQDQ(xmm0_xmm4, "0x01") // a1*b0 = f1:f0 529817466cbSJens Wiklander "pxor %%xmm3, %%xmm4 \n\t" // e1+f1:e0+f0 530817466cbSJens Wiklander "movdqa %%xmm4, %%xmm3 \n\t" // same 531817466cbSJens Wiklander "psrldq $8, %%xmm4 \n\t" // 0:e1+f1 532817466cbSJens Wiklander "pslldq $8, %%xmm3 \n\t" // e0+f0:0 533817466cbSJens Wiklander "pxor %%xmm4, %%xmm2 \n\t" // d1:d0+e1+f1 534817466cbSJens Wiklander "pxor %%xmm3, %%xmm1 \n\t" // c1+e0+f1:c0 535817466cbSJens Wiklander 536817466cbSJens Wiklander /* 537817466cbSJens Wiklander * Now shift the result one bit to the left, 53832b31808SJens Wiklander * taking advantage of [CLMUL-WP] eq 27 (p. 18) 539817466cbSJens Wiklander */ 540817466cbSJens Wiklander "movdqa %%xmm1, %%xmm3 \n\t" // r1:r0 541817466cbSJens Wiklander "movdqa %%xmm2, %%xmm4 \n\t" // r3:r2 542817466cbSJens Wiklander "psllq $1, %%xmm1 \n\t" // r1<<1:r0<<1 543817466cbSJens Wiklander "psllq $1, %%xmm2 \n\t" // r3<<1:r2<<1 544817466cbSJens Wiklander "psrlq $63, %%xmm3 \n\t" // r1>>63:r0>>63 545817466cbSJens Wiklander "psrlq $63, %%xmm4 \n\t" // r3>>63:r2>>63 546817466cbSJens Wiklander "movdqa %%xmm3, %%xmm5 \n\t" // r1>>63:r0>>63 547817466cbSJens Wiklander "pslldq $8, %%xmm3 \n\t" // r0>>63:0 548817466cbSJens Wiklander "pslldq $8, %%xmm4 \n\t" // r2>>63:0 549817466cbSJens Wiklander "psrldq $8, %%xmm5 \n\t" // 0:r1>>63 550817466cbSJens Wiklander "por %%xmm3, %%xmm1 \n\t" // r1<<1|r0>>63:r0<<1 551817466cbSJens Wiklander "por %%xmm4, %%xmm2 \n\t" // r3<<1|r2>>62:r2<<1 552817466cbSJens Wiklander "por %%xmm5, %%xmm2 \n\t" // r3<<1|r2>>62:r2<<1|r1>>63 553817466cbSJens Wiklander 554817466cbSJens Wiklander /* 555817466cbSJens Wiklander * Now reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1 55632b31808SJens Wiklander * using [CLMUL-WP] algorithm 5 (p. 18). 557817466cbSJens Wiklander * Currently xmm2:xmm1 holds x3:x2:x1:x0 (already shifted). 558817466cbSJens Wiklander */ 559817466cbSJens Wiklander /* Step 2 (1) */ 560817466cbSJens Wiklander "movdqa %%xmm1, %%xmm3 \n\t" // x1:x0 561817466cbSJens Wiklander "movdqa %%xmm1, %%xmm4 \n\t" // same 562817466cbSJens Wiklander "movdqa %%xmm1, %%xmm5 \n\t" // same 563817466cbSJens Wiklander "psllq $63, %%xmm3 \n\t" // x1<<63:x0<<63 = stuff:a 564817466cbSJens Wiklander "psllq $62, %%xmm4 \n\t" // x1<<62:x0<<62 = stuff:b 565817466cbSJens Wiklander "psllq $57, %%xmm5 \n\t" // x1<<57:x0<<57 = stuff:c 566817466cbSJens Wiklander 567817466cbSJens Wiklander /* Step 2 (2) */ 568817466cbSJens Wiklander "pxor %%xmm4, %%xmm3 \n\t" // stuff:a+b 569817466cbSJens Wiklander "pxor %%xmm5, %%xmm3 \n\t" // stuff:a+b+c 570817466cbSJens Wiklander "pslldq $8, %%xmm3 \n\t" // a+b+c:0 571817466cbSJens Wiklander "pxor %%xmm3, %%xmm1 \n\t" // x1+a+b+c:x0 = d:x0 572817466cbSJens Wiklander 573817466cbSJens Wiklander /* Steps 3 and 4 */ 574817466cbSJens Wiklander "movdqa %%xmm1,%%xmm0 \n\t" // d:x0 575817466cbSJens Wiklander "movdqa %%xmm1,%%xmm4 \n\t" // same 576817466cbSJens Wiklander "movdqa %%xmm1,%%xmm5 \n\t" // same 577817466cbSJens Wiklander "psrlq $1, %%xmm0 \n\t" // e1:x0>>1 = e1:e0' 578817466cbSJens Wiklander "psrlq $2, %%xmm4 \n\t" // f1:x0>>2 = f1:f0' 579817466cbSJens Wiklander "psrlq $7, %%xmm5 \n\t" // g1:x0>>7 = g1:g0' 580817466cbSJens Wiklander "pxor %%xmm4, %%xmm0 \n\t" // e1+f1:e0'+f0' 581817466cbSJens Wiklander "pxor %%xmm5, %%xmm0 \n\t" // e1+f1+g1:e0'+f0'+g0' 582817466cbSJens Wiklander // e0'+f0'+g0' is almost e0+f0+g0, ex\tcept for some missing 583817466cbSJens Wiklander // bits carried from d. Now get those\t bits back in. 584817466cbSJens Wiklander "movdqa %%xmm1,%%xmm3 \n\t" // d:x0 585817466cbSJens Wiklander "movdqa %%xmm1,%%xmm4 \n\t" // same 586817466cbSJens Wiklander "movdqa %%xmm1,%%xmm5 \n\t" // same 587817466cbSJens Wiklander "psllq $63, %%xmm3 \n\t" // d<<63:stuff 588817466cbSJens Wiklander "psllq $62, %%xmm4 \n\t" // d<<62:stuff 589817466cbSJens Wiklander "psllq $57, %%xmm5 \n\t" // d<<57:stuff 590817466cbSJens Wiklander "pxor %%xmm4, %%xmm3 \n\t" // d<<63+d<<62:stuff 591817466cbSJens Wiklander "pxor %%xmm5, %%xmm3 \n\t" // missing bits of d:stuff 592817466cbSJens Wiklander "psrldq $8, %%xmm3 \n\t" // 0:missing bits of d 593817466cbSJens Wiklander "pxor %%xmm3, %%xmm0 \n\t" // e1+f1+g1:e0+f0+g0 594817466cbSJens Wiklander "pxor %%xmm1, %%xmm0 \n\t" // h1:h0 595817466cbSJens Wiklander "pxor %%xmm2, %%xmm0 \n\t" // x3+h1:x2+h0 596817466cbSJens Wiklander 597817466cbSJens Wiklander "movdqu %%xmm0, (%2) \n\t" // done 598817466cbSJens Wiklander : 599817466cbSJens Wiklander : "r" (aa), "r" (bb), "r" (cc) 600817466cbSJens Wiklander : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); 601817466cbSJens Wiklander 602817466cbSJens Wiklander /* Now byte-reverse the outputs */ 60332b31808SJens Wiklander for (i = 0; i < 16; i++) { 604817466cbSJens Wiklander c[i] = cc[15 - i]; 60532b31808SJens Wiklander } 606817466cbSJens Wiklander 607817466cbSJens Wiklander return; 608817466cbSJens Wiklander } 609817466cbSJens Wiklander 610817466cbSJens Wiklander /* 611817466cbSJens Wiklander * Compute decryption round keys from encryption round keys 612817466cbSJens Wiklander */ 613b0563631STom Van Eyck #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT) 614817466cbSJens Wiklander void mbedtls_aesni_inverse_key(unsigned char *invkey, 615817466cbSJens Wiklander const unsigned char *fwdkey, int nr) 616817466cbSJens Wiklander { 617817466cbSJens Wiklander unsigned char *ik = invkey; 618817466cbSJens Wiklander const unsigned char *fk = fwdkey + 16 * nr; 619817466cbSJens Wiklander 620817466cbSJens Wiklander memcpy(ik, fk, 16); 621817466cbSJens Wiklander 62232b31808SJens Wiklander for (fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16) { 623817466cbSJens Wiklander asm ("movdqu (%0), %%xmm0 \n\t" 62432b31808SJens Wiklander AESIMC(xmm0_xmm0) 625817466cbSJens Wiklander "movdqu %%xmm0, (%1) \n\t" 626817466cbSJens Wiklander : 627817466cbSJens Wiklander : "r" (fk), "r" (ik) 628817466cbSJens Wiklander : "memory", "xmm0"); 62932b31808SJens Wiklander } 630817466cbSJens Wiklander 631817466cbSJens Wiklander memcpy(ik, fk, 16); 632817466cbSJens Wiklander } 633b0563631STom Van Eyck #endif 634817466cbSJens Wiklander 635817466cbSJens Wiklander /* 636817466cbSJens Wiklander * Key expansion, 128-bit case 637817466cbSJens Wiklander */ 638817466cbSJens Wiklander static void aesni_setkey_enc_128(unsigned char *rk, 639817466cbSJens Wiklander const unsigned char *key) 640817466cbSJens Wiklander { 641817466cbSJens Wiklander asm ("movdqu (%1), %%xmm0 \n\t" // copy the original key 642817466cbSJens Wiklander "movdqu %%xmm0, (%0) \n\t" // as round key 0 643817466cbSJens Wiklander "jmp 2f \n\t" // skip auxiliary routine 644817466cbSJens Wiklander 645817466cbSJens Wiklander /* 646817466cbSJens Wiklander * Finish generating the next round key. 647817466cbSJens Wiklander * 648817466cbSJens Wiklander * On entry xmm0 is r3:r2:r1:r0 and xmm1 is X:stuff:stuff:stuff 649817466cbSJens Wiklander * with X = rot( sub( r3 ) ) ^ RCON. 650817466cbSJens Wiklander * 651817466cbSJens Wiklander * On exit, xmm0 is r7:r6:r5:r4 652817466cbSJens Wiklander * with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3 653817466cbSJens Wiklander * and those are written to the round key buffer. 654817466cbSJens Wiklander */ 655817466cbSJens Wiklander "1: \n\t" 656817466cbSJens Wiklander "pshufd $0xff, %%xmm1, %%xmm1 \n\t" // X:X:X:X 657817466cbSJens Wiklander "pxor %%xmm0, %%xmm1 \n\t" // X+r3:X+r2:X+r1:r4 658817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" // r2:r1:r0:0 659817466cbSJens Wiklander "pxor %%xmm0, %%xmm1 \n\t" // X+r3+r2:X+r2+r1:r5:r4 660817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" // etc 661817466cbSJens Wiklander "pxor %%xmm0, %%xmm1 \n\t" 662817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" 663817466cbSJens Wiklander "pxor %%xmm1, %%xmm0 \n\t" // update xmm0 for next time! 664817466cbSJens Wiklander "add $16, %0 \n\t" // point to next round key 665817466cbSJens Wiklander "movdqu %%xmm0, (%0) \n\t" // write it 666817466cbSJens Wiklander "ret \n\t" 667817466cbSJens Wiklander 668817466cbSJens Wiklander /* Main "loop" */ 669817466cbSJens Wiklander "2: \n\t" 67032b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x01") "call 1b \n\t" 67132b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x02") "call 1b \n\t" 67232b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x04") "call 1b \n\t" 67332b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x08") "call 1b \n\t" 67432b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x10") "call 1b \n\t" 67532b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x20") "call 1b \n\t" 67632b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x40") "call 1b \n\t" 67732b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x80") "call 1b \n\t" 67832b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x1B") "call 1b \n\t" 67932b31808SJens Wiklander AESKEYGENA(xmm0_xmm1, "0x36") "call 1b \n\t" 680817466cbSJens Wiklander : 681817466cbSJens Wiklander : "r" (rk), "r" (key) 682*c3deb3d6SEtienne Carriere : "memory", "cc", "xmm0", "xmm1", "0"); 683817466cbSJens Wiklander } 684817466cbSJens Wiklander 685817466cbSJens Wiklander /* 686817466cbSJens Wiklander * Key expansion, 192-bit case 687817466cbSJens Wiklander */ 688b0563631STom Van Eyck #if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH) 689817466cbSJens Wiklander static void aesni_setkey_enc_192(unsigned char *rk, 690817466cbSJens Wiklander const unsigned char *key) 691817466cbSJens Wiklander { 692817466cbSJens Wiklander asm ("movdqu (%1), %%xmm0 \n\t" // copy original round key 693817466cbSJens Wiklander "movdqu %%xmm0, (%0) \n\t" 694817466cbSJens Wiklander "add $16, %0 \n\t" 695817466cbSJens Wiklander "movq 16(%1), %%xmm1 \n\t" 696817466cbSJens Wiklander "movq %%xmm1, (%0) \n\t" 697817466cbSJens Wiklander "add $8, %0 \n\t" 698817466cbSJens Wiklander "jmp 2f \n\t" // skip auxiliary routine 699817466cbSJens Wiklander 700817466cbSJens Wiklander /* 701817466cbSJens Wiklander * Finish generating the next 6 quarter-keys. 702817466cbSJens Wiklander * 703817466cbSJens Wiklander * On entry xmm0 is r3:r2:r1:r0, xmm1 is stuff:stuff:r5:r4 704817466cbSJens Wiklander * and xmm2 is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON. 705817466cbSJens Wiklander * 706817466cbSJens Wiklander * On exit, xmm0 is r9:r8:r7:r6 and xmm1 is stuff:stuff:r11:r10 707817466cbSJens Wiklander * and those are written to the round key buffer. 708817466cbSJens Wiklander */ 709817466cbSJens Wiklander "1: \n\t" 710817466cbSJens Wiklander "pshufd $0x55, %%xmm2, %%xmm2 \n\t" // X:X:X:X 711817466cbSJens Wiklander "pxor %%xmm0, %%xmm2 \n\t" // X+r3:X+r2:X+r1:r4 712817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" // etc 713817466cbSJens Wiklander "pxor %%xmm0, %%xmm2 \n\t" 714817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" 715817466cbSJens Wiklander "pxor %%xmm0, %%xmm2 \n\t" 716817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" 717817466cbSJens Wiklander "pxor %%xmm2, %%xmm0 \n\t" // update xmm0 = r9:r8:r7:r6 718817466cbSJens Wiklander "movdqu %%xmm0, (%0) \n\t" 719817466cbSJens Wiklander "add $16, %0 \n\t" 720817466cbSJens Wiklander "pshufd $0xff, %%xmm0, %%xmm2 \n\t" // r9:r9:r9:r9 721817466cbSJens Wiklander "pxor %%xmm1, %%xmm2 \n\t" // stuff:stuff:r9+r5:r10 722817466cbSJens Wiklander "pslldq $4, %%xmm1 \n\t" // r2:r1:r0:0 723817466cbSJens Wiklander "pxor %%xmm2, %%xmm1 \n\t" // xmm1 = stuff:stuff:r11:r10 724817466cbSJens Wiklander "movq %%xmm1, (%0) \n\t" 725817466cbSJens Wiklander "add $8, %0 \n\t" 726817466cbSJens Wiklander "ret \n\t" 727817466cbSJens Wiklander 728817466cbSJens Wiklander "2: \n\t" 72932b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x01") "call 1b \n\t" 73032b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x02") "call 1b \n\t" 73132b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x04") "call 1b \n\t" 73232b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x08") "call 1b \n\t" 73332b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x10") "call 1b \n\t" 73432b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x20") "call 1b \n\t" 73532b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x40") "call 1b \n\t" 73632b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x80") "call 1b \n\t" 737817466cbSJens Wiklander 738817466cbSJens Wiklander : 739817466cbSJens Wiklander : "r" (rk), "r" (key) 740*c3deb3d6SEtienne Carriere : "memory", "cc", "xmm0", "xmm1", "xmm2", "0"); 741817466cbSJens Wiklander } 742b0563631STom Van Eyck #endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */ 743817466cbSJens Wiklander 744817466cbSJens Wiklander /* 745817466cbSJens Wiklander * Key expansion, 256-bit case 746817466cbSJens Wiklander */ 747b0563631STom Van Eyck #if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH) 748817466cbSJens Wiklander static void aesni_setkey_enc_256(unsigned char *rk, 749817466cbSJens Wiklander const unsigned char *key) 750817466cbSJens Wiklander { 751817466cbSJens Wiklander asm ("movdqu (%1), %%xmm0 \n\t" 752817466cbSJens Wiklander "movdqu %%xmm0, (%0) \n\t" 753817466cbSJens Wiklander "add $16, %0 \n\t" 754817466cbSJens Wiklander "movdqu 16(%1), %%xmm1 \n\t" 755817466cbSJens Wiklander "movdqu %%xmm1, (%0) \n\t" 756817466cbSJens Wiklander "jmp 2f \n\t" // skip auxiliary routine 757817466cbSJens Wiklander 758817466cbSJens Wiklander /* 759817466cbSJens Wiklander * Finish generating the next two round keys. 760817466cbSJens Wiklander * 761817466cbSJens Wiklander * On entry xmm0 is r3:r2:r1:r0, xmm1 is r7:r6:r5:r4 and 762817466cbSJens Wiklander * xmm2 is X:stuff:stuff:stuff with X = rot( sub( r7 )) ^ RCON 763817466cbSJens Wiklander * 764817466cbSJens Wiklander * On exit, xmm0 is r11:r10:r9:r8 and xmm1 is r15:r14:r13:r12 765817466cbSJens Wiklander * and those have been written to the output buffer. 766817466cbSJens Wiklander */ 767817466cbSJens Wiklander "1: \n\t" 768817466cbSJens Wiklander "pshufd $0xff, %%xmm2, %%xmm2 \n\t" 769817466cbSJens Wiklander "pxor %%xmm0, %%xmm2 \n\t" 770817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" 771817466cbSJens Wiklander "pxor %%xmm0, %%xmm2 \n\t" 772817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" 773817466cbSJens Wiklander "pxor %%xmm0, %%xmm2 \n\t" 774817466cbSJens Wiklander "pslldq $4, %%xmm0 \n\t" 775817466cbSJens Wiklander "pxor %%xmm2, %%xmm0 \n\t" 776817466cbSJens Wiklander "add $16, %0 \n\t" 777817466cbSJens Wiklander "movdqu %%xmm0, (%0) \n\t" 778817466cbSJens Wiklander 779817466cbSJens Wiklander /* Set xmm2 to stuff:Y:stuff:stuff with Y = subword( r11 ) 780817466cbSJens Wiklander * and proceed to generate next round key from there */ 78132b31808SJens Wiklander AESKEYGENA(xmm0_xmm2, "0x00") 782817466cbSJens Wiklander "pshufd $0xaa, %%xmm2, %%xmm2 \n\t" 783817466cbSJens Wiklander "pxor %%xmm1, %%xmm2 \n\t" 784817466cbSJens Wiklander "pslldq $4, %%xmm1 \n\t" 785817466cbSJens Wiklander "pxor %%xmm1, %%xmm2 \n\t" 786817466cbSJens Wiklander "pslldq $4, %%xmm1 \n\t" 787817466cbSJens Wiklander "pxor %%xmm1, %%xmm2 \n\t" 788817466cbSJens Wiklander "pslldq $4, %%xmm1 \n\t" 789817466cbSJens Wiklander "pxor %%xmm2, %%xmm1 \n\t" 790817466cbSJens Wiklander "add $16, %0 \n\t" 791817466cbSJens Wiklander "movdqu %%xmm1, (%0) \n\t" 792817466cbSJens Wiklander "ret \n\t" 793817466cbSJens Wiklander 794817466cbSJens Wiklander /* 795817466cbSJens Wiklander * Main "loop" - Generating one more key than necessary, 796817466cbSJens Wiklander * see definition of mbedtls_aes_context.buf 797817466cbSJens Wiklander */ 798817466cbSJens Wiklander "2: \n\t" 79932b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x01") "call 1b \n\t" 80032b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x02") "call 1b \n\t" 80132b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x04") "call 1b \n\t" 80232b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x08") "call 1b \n\t" 80332b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x10") "call 1b \n\t" 80432b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x20") "call 1b \n\t" 80532b31808SJens Wiklander AESKEYGENA(xmm1_xmm2, "0x40") "call 1b \n\t" 806817466cbSJens Wiklander : 807817466cbSJens Wiklander : "r" (rk), "r" (key) 808*c3deb3d6SEtienne Carriere : "memory", "cc", "xmm0", "xmm1", "xmm2", "0"); 809817466cbSJens Wiklander } 810b0563631STom Van Eyck #endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */ 811817466cbSJens Wiklander 81232b31808SJens Wiklander #endif /* MBEDTLS_AESNI_HAVE_CODE */ 81332b31808SJens Wiklander 814817466cbSJens Wiklander /* 815817466cbSJens Wiklander * Key expansion, wrapper 816817466cbSJens Wiklander */ 817817466cbSJens Wiklander int mbedtls_aesni_setkey_enc(unsigned char *rk, 818817466cbSJens Wiklander const unsigned char *key, 819817466cbSJens Wiklander size_t bits) 820817466cbSJens Wiklander { 82132b31808SJens Wiklander switch (bits) { 822817466cbSJens Wiklander case 128: aesni_setkey_enc_128(rk, key); break; 823b0563631STom Van Eyck #if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH) 824817466cbSJens Wiklander case 192: aesni_setkey_enc_192(rk, key); break; 825817466cbSJens Wiklander case 256: aesni_setkey_enc_256(rk, key); break; 826b0563631STom Van Eyck #endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */ 82732b31808SJens Wiklander default: return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH; 828817466cbSJens Wiklander } 829817466cbSJens Wiklander 83032b31808SJens Wiklander return 0; 831817466cbSJens Wiklander } 832817466cbSJens Wiklander 83332b31808SJens Wiklander #endif /* MBEDTLS_AESNI_HAVE_CODE */ 834817466cbSJens Wiklander 835817466cbSJens Wiklander #endif /* MBEDTLS_AESNI_C */ 836