xref: /optee_os/lib/libmbedtls/mbedtls/library/aesce.c (revision 32b3180828fa15a49ccc86ecb4be9d274c140c89)
/*
 *  Armv8-A Cryptographic Extension support functions for AArch64
 *
 *  Copyright The Mbed TLS Contributors
 *  SPDX-License-Identifier: Apache-2.0
 *
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may
 *  not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#if defined(__aarch64__) && !defined(__ARM_FEATURE_CRYPTO) && \
    defined(__clang__) && __clang_major__ >= 4
/* TODO: Re-consider the above after https://reviews.llvm.org/D131064 is
 * merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations
 * without requiring -march on the command line.
 *
 * `arm_neon.h` could be included by any header file, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES    1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_HAVE_ARM64)

#if !defined(__ARM_FEATURE_AES) || defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__clang__)
#       if __clang_major__ < 4
#           error "A more recent Clang is required for MBEDTLS_AESCE_C"
#       endif
#       pragma clang attribute push (__attribute__((target("crypto"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       if __GNUC__ < 6
#           error "A more recent GCC is required for MBEDTLS_AESCE_C"
#       endif
#       pragma GCC push_options
#       pragma GCC target ("arch=armv8-a+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   else
#       error "Only GCC and Clang are supported for MBEDTLS_AESCE_C"
#   endif
#endif /* !__ARM_FEATURE_AES || MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#include <arm_neon.h>

#if defined(__linux__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif

/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support(void)
{
#if defined(__linux__)
    unsigned long auxval = getauxval(AT_HWCAP);
    return (auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
           (HWCAP_ASIMD | HWCAP_AES);
#else
    /* Assume AES instructions are supported. */
    return 1;
#endif
}
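
/* Usage sketch (illustrative only, not part of the upstream file): callers
 * would typically use the runtime check to dispatch between this accelerated
 * path and the portable C implementation, e.g.
 *
 *     if (mbedtls_aesce_has_support()) {
 *         mbedtls_aesce_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, in, out);
 *     } else {
 *         // generic table-based AES fallback
 *     }
 */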

static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    for (int i = 0; i < rounds - 1; i++) {
        /* AES AddRoundKey, SubBytes, ShiftRows (in this order).
         * AddRoundKey adds the round key for the previous round. */
        block = vaeseq_u8(block, vld1q_u8(keys + i * 16));
        /* AES MixColumns */
        block = vaesmcq_u8(block);
    }

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));

    return block;
}
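
/* Worked example (the values follow from the code above): for AES-128,
 * rounds == 10, so the loop performs nine AESE+AESMC pairs, followed by one
 * AESE (SubBytes/ShiftRows without MixColumns) and a final EOR with the last
 * round key; `keys` must therefore hold (rounds + 1) * 16 = 176 bytes of
 * round keys. */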

static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    for (int i = 0; i < rounds - 1; i++) {
        /* AES AddRoundKey, SubBytes, ShiftRows */
        block = vaesdq_u8(block, vld1q_u8(keys + i * 16));
        /* AES inverse MixColumns for the next round.
         *
         * This means that we switch the order of the inverse AddRoundKey and
         * inverse MixColumns operations. We have to do this as AddRoundKey is
         * done in an atomic instruction together with the inverses of SubBytes
         * and ShiftRows.
         *
         * It works because MixColumns is a linear operation over GF(2^8) and
         * AddRoundKey is an exclusive or, which is equivalent to addition over
         * GF(2^8). (The inverse of MixColumns needs to be applied to the
         * affected round keys separately; this has been done when the
         * decryption round keys were calculated.) */
        block = vaesimcq_u8(block);
    }

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));

    return block;
}
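
/* The reordering in the loop above relies on InvMixColumns being linear over
 * GF(2^8):
 *
 *     InvMixColumns(state ^ roundkey)
 *         == InvMixColumns(state) ^ InvMixColumns(roundkey)
 *
 * Applying InvMixColumns to the middle round keys once, at key-schedule time
 * (see mbedtls_aesce_inverse_key() below), therefore lets the AESD+AESIMC
 * instruction pair run in its hardware-defined order. */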

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

    if (mode == MBEDTLS_AES_ENCRYPT) {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    } else {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}
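
/* Minimal usage sketch (illustrative; assumes `key` is defined and uses the
 * standard mbedtls AES context API):
 *
 *     mbedtls_aes_context ctx;
 *     unsigned char in[16] = { 0 }, out[16];
 *     mbedtls_aes_init(&ctx);
 *     mbedtls_aes_setkey_enc(&ctx, key, 128);  // fills the round keys
 *     mbedtls_aesce_crypt_ecb(&ctx, MBEDTLS_AES_ENCRYPT, in, out);
 */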

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}
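
/* For example, with nr == 10 the code above copies the round keys in reverse
 * order: invkey[0..15] = fwdkey[160..175] (untransformed), the nine middle
 * keys are passed through AESIMC (InvMixColumns) into invkey[16..159], and
 * invkey[160..175] = fwdkey[0..15] (untransformed). */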

static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}
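
/* Worked example: round-key words are stored little-endian here, so
 * aes_rot_word(0x0d0c0b0a) == 0x0a0d0c0b, i.e. the bytes 0a,0b,0c,0d in
 * memory become 0b,0c,0d,0a, matching FIPS-197 RotWord([a0,a1,a2,a3]) =
 * [a1,a2,a3,a0]. */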

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. The input word is
     * duplicated into all four columns of the state, so ShiftRows has no net
     * effect and the first 32-bit lane is simply SubBytes applied to `in`. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
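    /* Worked example: a 128-bit key has Nk = 4, hence Nr = 4 + 6 = 10 and a
     * schedule of 4 * (10 + 1) = 44 words (176 bytes); 192-bit keys need 52
     * words and 256-bit keys 60 words. */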
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}
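
/* Illustrative call (the `rk` buffer here is hypothetical; the real caller
 * passes ctx->buf + ctx->rk_offset, sized per the FIPS-197 arithmetic above):
 *
 *     unsigned char rk[(14 + 1) * 16];  // 240 bytes, enough for AES-256
 *     if (mbedtls_aesce_setkey_enc(rk, key, 256) != 0) {
 *         // invalid key length
 *     }
 */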

#if defined(MBEDTLS_GCM_C)

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
static inline poly64_t vget_low_p64(poly64x2_t __a)
{
    uint64x2_t tmp = (uint64x2_t) (__a);
    uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0));
    return (poly64_t) (lo);
}
#endif /* !__clang__ && __GNUC__ && __GNUC__ == 5 */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_p64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128-bit polynomial multiplication on blocks in GF(2^128),
 * defined by `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64-bit -> 128-bit polynomial multipliers, so we need four
 * 64-bit multiplies to implement one 128-bit multiplication.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs the
 * 256-bit result represented by three 128-bit vectors, as a code-size
 * optimization.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high   128b |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low    128b |
 */
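/* Writing a = a1:a0 and b = b1:b0 as 64-bit halves, the carry-less product
 * expands as
 *
 *     a * b = a1*b1 * x^128 + (a1*b0 + a0*b1) * x^64 + a0*b0
 *
 * where + is XOR; h, m and l below hold these three terms, with m's x^64
 * offset accounted for later, in poly_mult_reduce(). */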
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);                       /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);                        /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8);                      /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);                       /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);                        /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);                         /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known
 * approach is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us
 * to write the 256-bit operand to be reduced as
 * a(z) = h(z)*z^128 + l(z) ≡ h(z)*r(z) + l(z). That is, we simply multiply
 * the higher part of the operand by r(z) and add it to l(z). If the result
 * is still larger than 128 bits, we reduce again.
 */
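/* Mapping to the code below: the 256-bit product h:m:l from poly_mult_128()
 * is a(z) = (h3:h2)*z^128 + (m2:m1)*z^64 + (l1:l0). MODULO holds r(z) = 0x87
 * (z^7 + z^2 + z + 1) in each 64-bit lane; c and d fold the high words h3
 * and h2 down, and f performs the second reduction on e2, the one word that
 * still sticks out above 128 bits. */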
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);
    /* use 'asm' as an optimisation barrier to prevent loading MODULO from memory */
    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
    asm ("" : "+w" (r));
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];            /* h3:h2:00:00                          */
    m = input.val[1];            /*   :m2:m1:00                          */
    l = input.val[2];            /*   :  :l1:l0                          */
    c = pmull_high(h, MODULO);   /*   :c2:c1:00 = reduction of h3        */
    d = pmull_low(h, MODULO);    /*   :  :d1:d0 = reduction of h2        */
    e = veorq_u8(c, m);          /*   :e2:e1:00 = m2:m1:00 + c2:c1:00    */
    f = pmull_high(e, MODULO);   /*   :  :f1:f0 = reduction of e2        */
    g = vextq_u8(ZERO, e, 8);    /*   :  :g1:00 = e1:00                  */
    n = veorq_u8(d, l);          /*   :  :n1:n0 = d1:d0 + l1:l0          */
    o = veorq_u8(n, f);          /*       o1:o0 = f1:f0 + n1:n0          */
    return veorq_u8(o, g);       /*             = o1:o0 + g1:00          */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    /* GCM uses a bit-reflected representation of field elements, so reverse
     * the bits within each byte (RBIT) before and after the multiplication. */
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}
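
/* GHASH usage sketch (illustrative; `y` is a hypothetical running state and
 * `h` the hash key): each 16-byte block x is absorbed as Y = (Y ^ X) * H:
 *
 *     for (size_t k = 0; k < 16; k++) {
 *         y[k] ^= x[k];
 *     }
 *     mbedtls_aesce_gcm_mult(y, y, h);  // in-place is safe: both inputs are
 *                                       // fully loaded before the store
 */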

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_HAVE_ARM64 */

#endif /* MBEDTLS_AESCE_C */