1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <assert.h>
5 #include "aes_core.h"
6 #include "aes_locl.h"
7
8 #define OPENSSL_FIPSAPI
9 #define TABLE_BITS 1
10 #include <string.h>
11 #define DEBUG(format,...) printf("[%s]:%d: "format"\n", __func__,__LINE__, ##__VA_ARGS__)
12
13 #ifndef MODES_DEBUG
14 # ifndef NDEBUG
15 # define NDEBUG
16 # endif
17 #endif
18 #include <assert.h>
19
20 #if defined(BSWAP4)
21 /* redefine, because alignment is ensured */
22 #undef GETU32
23 #define GETU32(p) BSWAP4(*(const u32 *)(p))
24 #undef PUTU32
25 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
26 #endif
27
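/*
 * PACK() places a 16-bit constant in the most significant 16 bits of a
 * size_t; it is used to build the rem_4bit/rem_8bit reduction tables
 * below. REDUCE1BIT() multiplies the 128-bit value V by x in GF(2^128)
 * using GCM's bit-reflected convention: V is shifted right by one bit
 * and, if a bit fell off the end, the reduction polynomial
 * x^128 + x^7 + x^2 + x + 1 (encoded as 0xE1 in the top byte) is folded
 * back in.
 */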
28 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
29 #define REDUCE1BIT(V) do { \
30 if (sizeof(size_t)==8) { \
31 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
32 V.lo = (V.hi<<63)|(V.lo>>1); \
33 V.hi = (V.hi>>1 )^T; \
34 } \
35 else { \
36 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
37 V.lo = (V.hi<<63)|(V.lo>>1); \
38 V.hi = (V.hi>>1 )^((u64)T<<32); \
39 } \
40 } while(0)
41
42 typedef struct { u64 hi,lo; } u128;
43
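/*
 * block128_f is a single 16-byte block-cipher operation (here AES, see
 * rk_aes_gcm_encrypt() at the bottom of this file); ctr128_f is an
 * optional stream routine that encrypts `blocks` consecutive counter
 * blocks starting from ivec, used only by the CTR32 variants that are
 * compiled out further below.
 */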
44 typedef void (*block128_f)(const unsigned char in[16],
45 unsigned char out[16],
46 const void *key);
47
48 typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
49 unsigned int blocks, const void *key,
50 const unsigned char ivec[16]);
51
52
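/*
 * Per-context state: Yi is the current counter block, EKi the keystream
 * block E_K(Yi) currently being consumed, EK0 the encrypted pre-counter
 * block E_K(Y0) that masks the final tag, len holds the AAD and message
 * byte counts, Xi is the running GHASH value and H the hash subkey
 * E_K(0^128). mres/ares count how many bytes of a partial message/AAD
 * block are buffered between calls.
 */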
53 struct gcm128_context {
/* The following six fields are named after the GCM specification */
55 union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len,
56 Xi,H;
57 /* Relative position of Xi, H and pre-computed Htable is used
58 * in some assembler modules, i.e. don't change the order! */
59 #if TABLE_BITS==8
60 u128 Htable[256];
61 #else
62 u128 Htable[16];
63 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
64 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
65 #endif
66 unsigned int mres, ares;
67 block128_f block;
68 void *key;
69 };
70
71 typedef struct gcm128_context GCM128_CONTEXT;
72
/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8; 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification,
 * so OpenSSL does not cover the whole spectrum of possible table-driven
 * implementations. Why? In the non-"Shoup's" case the memory access
 * pattern is segmented in such a manner that cache-timing information
 * can trivially reveal a fair portion of the intermediate hash value.
 * Given that the ciphertext is always available to an attacker, this
 * would let him attempt to deduce the secret parameter H and, if
 * successful, tamper with messages [which is trivial in CTR mode]. In
 * the "Shoup's" case it is not as easy, but there is no reason to
 * believe the approach is resistant to cache-timing attacks either.
 * The catch with the "8-bit" implementation is that it consumes 16
 * (sixteen) times more memory, 4KB per individual key plus 1KB shared.
 * On the plus side it should be about twice as fast as the "4-bit"
 * version; for gcc-generated x86[_64] code the "8-bit" version was
 * observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security/performance balance and
 * adequate all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
107 #if TABLE_BITS==8
108
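/*
 * Precomputes Htable[i] = i*H in GF(2^128) (bit-reflected convention)
 * for every 8-bit index, so gcm_gmult_8bit() can consume Xi one byte at
 * a time with a single table lookup per byte ("Shoup's" 8-bit method).
 */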
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
110 {
111 int i, j;
112 u128 V;
113
114 Htable[0].hi = 0;
115 Htable[0].lo = 0;
116 V.hi = H[0];
117 V.lo = H[1];
118
119 for (Htable[128]=V, i=64; i>0; i>>=1) {
120 REDUCE1BIT(V);
121 Htable[i] = V;
122 }
123
124 for (i=2; i<256; i<<=1) {
125 u128 *Hi = Htable+i, H0 = *Hi;
126 for (j=1; j<i; ++j) {
127 Hi[j].hi = H0.hi^Htable[j].hi;
128 Hi[j].lo = H0.lo^Htable[j].lo;
129 }
130 }
131 }
132
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
134 {
135 u128 Z = { 0, 0};
136 const u8 *xi = (const u8 *)Xi+15;
137 size_t rem, n = *xi;
138 const union { long one; char little; } is_endian = {1};
139 static const size_t rem_8bit[256] = {
140 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
141 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
142 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
143 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
144 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
145 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
146 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
147 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
148 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
149 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
150 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
151 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
152 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
153 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
154 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
155 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
156 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
157 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
158 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
159 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
160 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
161 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
162 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
163 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
164 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
165 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
166 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
167 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
168 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
169 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
170 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
171 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
172 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
173 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
174 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
175 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
176 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
177 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
178 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
179 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
180 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
181 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
182 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
183 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
184 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
185 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
186 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
187 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
188 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
189 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
190 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
191 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
192 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
193 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
194 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
195 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
196 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
197 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
198 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
199 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
200 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
201 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
202 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
203 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
204
205 while (1) {
206 Z.hi ^= Htable[n].hi;
207 Z.lo ^= Htable[n].lo;
208
209 if ((u8 *)Xi==xi) break;
210
211 n = *(--xi);
212
213 rem = (size_t)Z.lo&0xff;
214 Z.lo = (Z.hi<<56)|(Z.lo>>8);
215 Z.hi = (Z.hi>>8);
216 if (sizeof(size_t)==8)
217 Z.hi ^= rem_8bit[rem];
218 else
219 Z.hi ^= (u64)rem_8bit[rem]<<32;
220 }
221
222 if (is_endian.little) {
223 #ifdef BSWAP8
224 Xi[0] = BSWAP8(Z.hi);
225 Xi[1] = BSWAP8(Z.lo);
226 #else
227 u8 *p = (u8 *)Xi;
228 u32 v;
229 v = (u32)(Z.hi>>32); PUTU32(p,v);
230 v = (u32)(Z.hi); PUTU32(p+4,v);
231 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
232 v = (u32)(Z.lo); PUTU32(p+12,v);
233 #endif
234 }
235 else {
236 Xi[0] = Z.hi;
237 Xi[1] = Z.lo;
238 }
239 }
240 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
241
242 #elif TABLE_BITS==4
243
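/*
 * Precomputes the 16-entry table Htable[i] = i*H (bit-reflected
 * convention) used by the nibble-at-a-time "Shoup's" 4-bit GHASH
 * routines below.
 */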
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
245 {
246 u128 V;
247 #if defined(OPENSSL_SMALL_FOOTPRINT)
248 int i;
249 #endif
250
251 Htable[0].hi = 0;
252 Htable[0].lo = 0;
253 V.hi = H[0];
254 V.lo = H[1];
255
256 #if defined(OPENSSL_SMALL_FOOTPRINT)
257 for (Htable[8]=V, i=4; i>0; i>>=1) {
258 REDUCE1BIT(V);
259 Htable[i] = V;
260 }
261
262 for (i=2; i<16; i<<=1) {
263 u128 *Hi = Htable+i;
264 int j;
265 for (V=*Hi, j=1; j<i; ++j) {
266 Hi[j].hi = V.hi^Htable[j].hi;
267 Hi[j].lo = V.lo^Htable[j].lo;
268 }
269 }
270 #else
271 Htable[8] = V;
272 REDUCE1BIT(V);
273 Htable[4] = V;
274 REDUCE1BIT(V);
275 Htable[2] = V;
276 REDUCE1BIT(V);
277 Htable[1] = V;
278 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
279 V=Htable[4];
280 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
281 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
282 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
283 V=Htable[8];
284 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
285 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
286 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
287 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
288 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
289 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
290 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
291 #endif
292 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
293 /*
294 * ARM assembler expects specific dword order in Htable.
295 */
296 {
297 int j;
298 const union { long one; char little; } is_endian = {1};
299
300 if (is_endian.little)
301 for (j=0;j<16;++j) {
302 V = Htable[j];
303 Htable[j].hi = V.lo;
304 Htable[j].lo = V.hi;
305 }
306 else
307 for (j=0;j<16;++j) {
308 V = Htable[j];
309 Htable[j].hi = V.lo<<32|V.lo>>32;
310 Htable[j].lo = V.hi<<32|V.hi>>32;
311 }
312 }
313 #endif
314 }
315
316 #ifndef GHASH_ASM
317 static const size_t rem_4bit[16] = {
318 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
319 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
320 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
321 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
322
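/*
 * Xi = Xi*H. Xi is scanned from its last byte towards the first; the
 * low and then high nibble of each byte select Htable entries that are
 * XORed into the accumulator Z. Between nibbles Z is shifted right by
 * four bits and the bits shifted out are reduced back in through
 * rem_4bit. The result is written back to Xi in big-endian byte order.
 */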
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
324 {
325 u128 Z;
326 int cnt = 15;
327 size_t rem, nlo, nhi;
328 const union { long one; char little; } is_endian = {1};
329
330 nlo = ((const u8 *)Xi)[15];
331 nhi = nlo>>4;
332 nlo &= 0xf;
333
334 Z.hi = Htable[nlo].hi;
335 Z.lo = Htable[nlo].lo;
336
337 while (1) {
338 rem = (size_t)Z.lo&0xf;
339 Z.lo = (Z.hi<<60)|(Z.lo>>4);
340 Z.hi = (Z.hi>>4);
341 if (sizeof(size_t)==8)
342 Z.hi ^= rem_4bit[rem];
343 else
344 Z.hi ^= (u64)rem_4bit[rem]<<32;
345
346 Z.hi ^= Htable[nhi].hi;
347 Z.lo ^= Htable[nhi].lo;
348
349 if (--cnt<0) break;
350
351 nlo = ((const u8 *)Xi)[cnt];
352 nhi = nlo>>4;
353 nlo &= 0xf;
354
355 rem = (size_t)Z.lo&0xf;
356 Z.lo = (Z.hi<<60)|(Z.lo>>4);
357 Z.hi = (Z.hi>>4);
358 if (sizeof(size_t)==8)
359 Z.hi ^= rem_4bit[rem];
360 else
361 Z.hi ^= (u64)rem_4bit[rem]<<32;
362
363 Z.hi ^= Htable[nlo].hi;
364 Z.lo ^= Htable[nlo].lo;
365 }
366
367 if (is_endian.little) {
368 #ifdef BSWAP8
369 Xi[0] = BSWAP8(Z.hi);
370 Xi[1] = BSWAP8(Z.lo);
371 #else
372 u8 *p = (u8 *)Xi;
373 u32 v;
374 v = (u32)(Z.hi>>32); PUTU32(p,v);
375 v = (u32)(Z.hi); PUTU32(p+4,v);
376 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
377 v = (u32)(Z.lo); PUTU32(p+12,v);
378 #endif
379 }
380 else {
381 Xi[0] = Z.hi;
382 Xi[1] = Z.lo;
383 }
384 }
385
386 #if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
				const u8 *inp, size_t len)
396 {
397 u128 Z;
398 int cnt;
399 size_t rem, nlo, nhi;
400 const union { long one; char little; } is_endian = {1};
401
402 #if 1
403 do {
404 cnt = 15;
405 nlo = ((const u8 *)Xi)[15];
406 nlo ^= inp[15];
407 nhi = nlo>>4;
408 nlo &= 0xf;
409
410 Z.hi = Htable[nlo].hi;
411 Z.lo = Htable[nlo].lo;
412
413 while (1) {
414 rem = (size_t)Z.lo&0xf;
415 Z.lo = (Z.hi<<60)|(Z.lo>>4);
416 Z.hi = (Z.hi>>4);
417 if (sizeof(size_t)==8)
418 Z.hi ^= rem_4bit[rem];
419 else
420 Z.hi ^= (u64)rem_4bit[rem]<<32;
421
422 Z.hi ^= Htable[nhi].hi;
423 Z.lo ^= Htable[nhi].lo;
424
425 if (--cnt<0) break;
426
427 nlo = ((const u8 *)Xi)[cnt];
428 nlo ^= inp[cnt];
429 nhi = nlo>>4;
430 nlo &= 0xf;
431
432 rem = (size_t)Z.lo&0xf;
433 Z.lo = (Z.hi<<60)|(Z.lo>>4);
434 Z.hi = (Z.hi>>4);
435 if (sizeof(size_t)==8)
436 Z.hi ^= rem_4bit[rem];
437 else
438 Z.hi ^= (u64)rem_4bit[rem]<<32;
439
440 Z.hi ^= Htable[nlo].hi;
441 Z.lo ^= Htable[nlo].lo;
442 }
443 #else
444 /*
445 * Extra 256+16 bytes per-key plus 512 bytes shared tables
446 * [should] give ~50% improvement... One could have PACK()-ed
447 * the rem_8bit even here, but the priority is to minimize
448 * cache footprint...
449 */
450 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
451 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
452 static const unsigned short rem_8bit[256] = {
453 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
454 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
455 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
456 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
457 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
458 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
459 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
460 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
461 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
462 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
463 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
464 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
465 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
466 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
467 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
468 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
469 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
470 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
471 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
472 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
473 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
474 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
475 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
476 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
477 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
478 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
479 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
480 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
481 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
482 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
483 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
484 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
/*
 * This pre-processing phase slows the procedure down by roughly as much
 * time as it saves per loop iteration. In other words, single-block
 * performance is about the same as the straightforward "4-bit"
 * implementation, and for longer inputs it only gets faster...
 */
491 for (cnt=0; cnt<16; ++cnt) {
492 Z.hi = Htable[cnt].hi;
493 Z.lo = Htable[cnt].lo;
494 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
495 Hshr4[cnt].hi = (Z.hi>>4);
496 Hshl4[cnt] = (u8)(Z.lo<<4);
497 }
498
499 do {
500 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
501 nlo = ((const u8 *)Xi)[cnt];
502 nlo ^= inp[cnt];
503 nhi = nlo>>4;
504 nlo &= 0xf;
505
506 Z.hi ^= Htable[nlo].hi;
507 Z.lo ^= Htable[nlo].lo;
508
509 rem = (size_t)Z.lo&0xff;
510
511 Z.lo = (Z.hi<<56)|(Z.lo>>8);
512 Z.hi = (Z.hi>>8);
513
514 Z.hi ^= Hshr4[nhi].hi;
515 Z.lo ^= Hshr4[nhi].lo;
516 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
517 }
518
519 nlo = ((const u8 *)Xi)[0];
520 nlo ^= inp[0];
521 nhi = nlo>>4;
522 nlo &= 0xf;
523
524 Z.hi ^= Htable[nlo].hi;
525 Z.lo ^= Htable[nlo].lo;
526
527 rem = (size_t)Z.lo&0xf;
528
529 Z.lo = (Z.hi<<60)|(Z.lo>>4);
530 Z.hi = (Z.hi>>4);
531
532 Z.hi ^= Htable[nhi].hi;
533 Z.lo ^= Htable[nhi].lo;
534 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
535 #endif
536
537 if (is_endian.little) {
538 #ifdef BSWAP8
539 Xi[0] = BSWAP8(Z.hi);
540 Xi[1] = BSWAP8(Z.lo);
541 #else
542 u8 *p = (u8 *)Xi;
543 u32 v;
544 v = (u32)(Z.hi>>32); PUTU32(p,v);
545 v = (u32)(Z.hi); PUTU32(p+4,v);
546 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
547 v = (u32)(Z.lo); PUTU32(p+12,v);
548 #endif
549 }
550 else {
551 Xi[0] = Z.hi;
552 Xi[1] = Z.lo;
553 }
554 } while (inp+=16, len-=16);
555 }
556 #endif
557 #else
558 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
559 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
560 #endif
561
562 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
563 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
564 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" intended to mitigate cache
 * thrashing: the idea is to hash data while it is still in the L1
 * cache after the encryption pass... */
568 #define GHASH_CHUNK (3*1024)
569 #endif
570
571 #else /* TABLE_BITS */
572
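/*
 * Table-free GHASH multiplication used for TABLE_BITS==1, which is the
 * configuration selected at the top of this file. Xi is read one
 * machine word at a time (byte-swapped to host order on little-endian
 * targets); for every bit the current multiple of H kept in V is
 * conditionally XORed into Z, after which V is advanced one bit with
 * REDUCE1BIT(). Slowest variant, but it needs no per-key table.
 */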
573 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
574 {
575 u128 V,Z = { 0,0 };
576 long X;
577 unsigned int i,j;
578 const long *xi = (const long *)Xi;
579 const union { long one; char little; } is_endian = {1};
580
581 V.hi = H[0]; /* H is in host byte order, no byte swapping */
582 V.lo = H[1];
583
584 for (j=0; j<16/sizeof(long); ++j) {
585 if (is_endian.little) {
586 if (sizeof(long)==8) {
587 #ifdef BSWAP8
588 X = (long)(BSWAP8(xi[j]));
589 #else
590 const u8 *p = (const u8 *)(xi+j);
591 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
592 #endif
593 }
594 else {
595 const u8 *p = (const u8 *)(xi+j);
596 X = (long)GETU32(p);
597 }
598 }
599 else
600 X = xi[j];
601
602 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
603 u64 M = (u64)(X>>(8*sizeof(long)-1));
604 Z.hi ^= V.hi&M;
605 Z.lo ^= V.lo&M;
606
607 REDUCE1BIT(V);
608 }
609 }
610
611 if (is_endian.little) {
612 #ifdef BSWAP8
613 Xi[0] = BSWAP8(Z.hi);
614 Xi[1] = BSWAP8(Z.lo);
615 #else
616 u8 *p = (u8 *)Xi;
617 u32 v;
618 v = (u32)(Z.hi>>32); PUTU32(p,v);
619 v = (u32)(Z.hi); PUTU32(p+4,v);
620 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
621 v = (u32)(Z.lo); PUTU32(p+12,v);
622 #endif
623 }
624 else {
625 Xi[0] = Z.hi;
626 Xi[1] = Z.lo;
627 }
628 }
629 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
630
631 #endif
632
633 #if TABLE_BITS==4 && defined(GHASH_ASM)
634 # if !defined(I386_ONLY) && \
635 (defined(__i386) || defined(__i386__) || \
636 defined(__x86_64) || defined(__x86_64__) || \
637 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
638 # define GHASH_ASM_X86_OR_64
639 # define GCM_FUNCREF_4BIT
640 extern unsigned int OPENSSL_ia32cap_P[2];
641
642 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
643 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
644 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
645
646 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
647 # define GHASH_ASM_X86
648 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
649 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
650
651 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
652 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
653 # endif
654 # elif defined(__arm__) || defined(__arm)
655 # include "arm_arch.h"
656 # if __ARM_ARCH__>=7
657 # define GHASH_ASM_ARM
658 # define GCM_FUNCREF_4BIT
659 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
660 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 # endif
662 # endif
663 #endif
664
665 #ifdef GCM_FUNCREF_4BIT
666 # undef GCM_MUL
667 # define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
668 # ifdef GHASH
669 # undef GHASH
670 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
671 # endif
672 #endif
673
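/*
 * Derives the hash subkey H = E_K(0^128) by encrypting the all-zero
 * block with the supplied cipher, converts it to host byte order and
 * prepares the GHASH implementation: the multiplication table(s) are
 * precomputed and, where assembler support is compiled in, CPU-specific
 * gmult/ghash routines (PCLMULQDQ, MMX/SSE or NEON) are selected.
 */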
674 static void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
675 {
676 const union { long one; char little; } is_endian = {1};
677
678 memset(ctx,0,sizeof(*ctx));
679 ctx->block = block;
680 ctx->key = key;
681
682 (*block)(ctx->H.c, ctx->H.c,key);
683
684 if (is_endian.little) {
685 /* H is stored in host byte order */
686 #ifdef BSWAP8
687 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
688 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
689 #else
690 u8 *p = ctx->H.c;
691 u64 hi,lo;
692 hi = (u64)GETU32(p) <<32|GETU32(p+4);
693 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
694 ctx->H.u[0] = hi;
695 ctx->H.u[1] = lo;
696 #endif
697 }
698
699 #if TABLE_BITS==8
700 gcm_init_8bit(ctx->Htable,ctx->H.u);
701 #elif TABLE_BITS==4
702 # if defined(GHASH_ASM_X86_OR_64)
703 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
704 if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
705 OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
706 gcm_init_clmul(ctx->Htable,ctx->H.u);
707 ctx->gmult = gcm_gmult_clmul;
708 ctx->ghash = gcm_ghash_clmul;
709 return;
710 }
711 # endif
712 gcm_init_4bit(ctx->Htable,ctx->H.u);
713 # if defined(GHASH_ASM_X86) /* x86 only */
714 # if defined(OPENSSL_IA32_SSE2)
715 if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
716 # else
717 if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
718 # endif
719 ctx->gmult = gcm_gmult_4bit_mmx;
720 ctx->ghash = gcm_ghash_4bit_mmx;
721 } else {
722 ctx->gmult = gcm_gmult_4bit_x86;
723 ctx->ghash = gcm_ghash_4bit_x86;
724 }
725 # else
726 ctx->gmult = gcm_gmult_4bit;
727 ctx->ghash = gcm_ghash_4bit;
728 # endif
729 # elif defined(GHASH_ASM_ARM)
730 if (OPENSSL_armcap_P & ARMV7_NEON) {
731 ctx->gmult = gcm_gmult_neon;
732 ctx->ghash = gcm_ghash_neon;
733 } else {
734 gcm_init_4bit(ctx->Htable,ctx->H.u);
735 ctx->gmult = gcm_gmult_4bit;
736 ctx->ghash = gcm_ghash_4bit;
737 }
738 # else
739 gcm_init_4bit(ctx->Htable,ctx->H.u);
740 # endif
741 #endif
742 }
743
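/*
 * Resets the per-message state and derives the pre-counter block Y0.
 * A 96-bit IV is used directly as Y0 = IV || 0^31 || 1; any other
 * length is run through GHASH together with the 64-bit IV bit length.
 * EK0 = E_K(Y0) is cached for the final tag and the counter is then
 * incremented so that encryption starts from Y1.
 */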
744 static void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
745 {
746 const union { long one; char little; } is_endian = {1};
747 unsigned int ctr;
748 #ifdef GCM_FUNCREF_4BIT
749 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
750 #endif
751
752 ctx->Yi.u[0] = 0;
753 ctx->Yi.u[1] = 0;
754 ctx->Xi.u[0] = 0;
755 ctx->Xi.u[1] = 0;
756 ctx->len.u[0] = 0; /* AAD length */
757 ctx->len.u[1] = 0; /* message length */
758 ctx->ares = 0;
759 ctx->mres = 0;
760
761 if (len==12) {
762 memcpy(ctx->Yi.c,iv,12);
763 ctx->Yi.c[15]=1;
764 ctr=1;
765 }
766 else {
767 size_t i;
768 u64 len0 = len;
769
770 while (len>=16) {
771 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
772 GCM_MUL(ctx,Yi);
773 iv += 16;
774 len -= 16;
775 }
776 if (len) {
777 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
778 GCM_MUL(ctx,Yi);
779 }
780 len0 <<= 3;
781 if (is_endian.little) {
782 #ifdef BSWAP8
783 ctx->Yi.u[1] ^= BSWAP8(len0);
784 #else
785 ctx->Yi.c[8] ^= (u8)(len0>>56);
786 ctx->Yi.c[9] ^= (u8)(len0>>48);
787 ctx->Yi.c[10] ^= (u8)(len0>>40);
788 ctx->Yi.c[11] ^= (u8)(len0>>32);
789 ctx->Yi.c[12] ^= (u8)(len0>>24);
790 ctx->Yi.c[13] ^= (u8)(len0>>16);
791 ctx->Yi.c[14] ^= (u8)(len0>>8);
792 ctx->Yi.c[15] ^= (u8)(len0);
793 #endif
794 }
795 else
796 ctx->Yi.u[1] ^= len0;
797
798 GCM_MUL(ctx,Yi);
799
800 if (is_endian.little)
801 ctr = GETU32(ctx->Yi.c+12);
802 else
803 ctr = ctx->Yi.d[3];
804 }
805
806 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
807 ++ctr;
808 if (is_endian.little){
809 PUTU32(ctx->Yi.c+12,ctr);
810 }
811 else
812 ctx->Yi.d[3] = ctr;
813 }
814
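/*
 * Absorbs additional authenticated data into Xi. Partial blocks are
 * buffered via ctx->ares and completed on the next call. AAD must be
 * supplied before any plaintext/ciphertext (returns -2 once the message
 * length is non-zero) and is limited to 2^64 bits (2^61 bytes).
 */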
815 static int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
816 {
817 size_t i;
818 unsigned int n;
819 u64 alen = ctx->len.u[0];
820 #ifdef GCM_FUNCREF_4BIT
821 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
822 # ifdef GHASH
823 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
824 const u8 *inp,size_t len) = ctx->ghash;
825 # endif
826 #endif
827
828 if (ctx->len.u[1]) return -2;
829
830 alen += len;
831 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
832 return -1;
833 ctx->len.u[0] = alen;
834
835 n = ctx->ares;
836 if (n) {
837 while (n && len) {
838 ctx->Xi.c[n] ^= *(aad++);
839 --len;
840 n = (n+1)%16;
841 }
842 if (n==0) GCM_MUL(ctx,Xi);
843 else {
844 ctx->ares = n;
845 return 0;
846 }
847 }
848
849 #ifdef GHASH
850 if ((i = (len&(size_t)-16))) {
851 GHASH(ctx,aad,i);
852 aad += i;
853 len -= i;
854 }
855 #else
856 while (len>=16) {
857 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
858 GCM_MUL(ctx,Xi);
859 aad += 16;
860 len -= 16;
861 }
862 #endif
863 if (len) {
864 n = (unsigned int)len;
865 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
866 }
867
868 ctx->ares = n;
869 return 0;
870 }
871
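/*
 * CTR-mode encryption interleaved with GHASH over the produced
 * ciphertext. ctx->mres remembers how many bytes of the current
 * keystream block EKi have been consumed, so calls may pass arbitrary
 * lengths; the total message is limited to 2^36 - 32 bytes as required
 * by GCM. When the table-driven GHASH is enabled, whole
 * GHASH_CHUNK-byte runs are encrypted first and hashed immediately
 * afterwards, while the ciphertext is still in the L1 cache.
 */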
872 static int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
873 const unsigned char *in, unsigned char *out,
874 size_t len)
875 {
876 const union { long one; char little; } is_endian = {1};
877 unsigned int n, ctr;
878 size_t i;
879 u64 mlen = ctx->len.u[1];
880 block128_f block = ctx->block;
881 void *key = ctx->key;
882 #ifdef GCM_FUNCREF_4BIT
883 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
884 # ifdef GHASH
885 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
886 const u8 *inp,size_t len) = ctx->ghash;
887 # endif
888 #endif
889
890 #if 0
891 n = (unsigned int)mlen%16; /* alternative to ctx->mres */
892 #endif
893 mlen += len;
894 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
895 return -1;
896 ctx->len.u[1] = mlen;
897
898 if (ctx->ares) {
899 /* First call to encrypt finalizes GHASH(AAD) */
900 GCM_MUL(ctx,Xi);
901 ctx->ares = 0;
902 }
903
904 if (is_endian.little)
905 ctr = GETU32(ctx->Yi.c+12);
906 else
907 ctr = ctx->Yi.d[3];
908
909 n = ctx->mres;
910 #if !defined(OPENSSL_SMALL_FOOTPRINT)
911 if (16%sizeof(size_t) == 0) do { /* always true actually */
912 if (n) {
913 while (n && len) {
914 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
915 --len;
916 n = (n+1)%16;
917 }
918 if (n==0) GCM_MUL(ctx,Xi);
919 else {
920 ctx->mres = n;
921 return 0;
922 }
923 }
924 #if defined(STRICT_ALIGNMENT)
925 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
926 break;
927 #endif
928 #if defined(GHASH) && defined(GHASH_CHUNK)
929 while (len>=GHASH_CHUNK) {
930 size_t j=GHASH_CHUNK;
931
932 while (j) {
933 (*block)(ctx->Yi.c,ctx->EKi.c,key);
934 ++ctr;
935 if (is_endian.little){
936 PUTU32(ctx->Yi.c+12,ctr);
937 }
938 else
939 ctx->Yi.d[3] = ctr;
940 for (i=0; i<16; i+=sizeof(size_t))
941 *(size_t *)(out+i) =
942 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
943 out += 16;
944 in += 16;
945 j -= 16;
946 }
947 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
948 len -= GHASH_CHUNK;
949 }
950 if ((i = (len&(size_t)-16))) {
951 size_t j=i;
952
953 while (len>=16) {
954 (*block)(ctx->Yi.c,ctx->EKi.c,key);
955 ++ctr;
956 if (is_endian.little){
957 PUTU32(ctx->Yi.c+12,ctr);
958 }
959 else
960 ctx->Yi.d[3] = ctr;
961 for (i=0; i<16; i+=sizeof(size_t))
962 *(size_t *)(out+i) =
963 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
964 out += 16;
965 in += 16;
966 len -= 16;
967 }
968 GHASH(ctx,out-j,j);
969 }
970 #else
971 while (len>=16) {
972 (*block)(ctx->Yi.c,ctx->EKi.c,key);
973 ++ctr;
974 if (is_endian.little){
975 PUTU32(ctx->Yi.c+12,ctr);
976 }
977 else
978 ctx->Yi.d[3] = ctr;
979 for (i=0; i<16; i+=sizeof(size_t))
980 *(size_t *)(ctx->Xi.c+i) ^=
981 *(size_t *)(out+i) =
982 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
983 GCM_MUL(ctx,Xi);
984 out += 16;
985 in += 16;
986 len -= 16;
987 }
988 #endif
989 if (len) {
990 (*block)(ctx->Yi.c,ctx->EKi.c,key);
991 ++ctr;
992 if (is_endian.little){
993 PUTU32(ctx->Yi.c+12,ctr);
994 }
995 else
996 ctx->Yi.d[3] = ctr;
997 while (len--) {
998 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
999 ++n;
1000 }
1001 }
1002
1003 ctx->mres = n;
1004 return 0;
1005 } while(0);
1006 #endif
1007 for (i=0;i<len;++i) {
1008 if (n==0) {
1009 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1010 ++ctr;
1011 if (is_endian.little){
1012 PUTU32(ctx->Yi.c+12,ctr);
1013 }
1014 else
1015 ctx->Yi.d[3] = ctr;
1016 }
1017 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1018 n = (n+1)%16;
1019 if (n==0)
1020 GCM_MUL(ctx,Xi);
1021 }
1022
1023 ctx->mres = n;
1024 return 0;
1025 }
1026
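/*
 * Mirror image of CRYPTO_gcm128_encrypt(): GHASH always runs over the
 * ciphertext, so incoming data is hashed into Xi and XORed with the CTR
 * keystream to recover the plaintext. ctx->mres again tracks partial
 * keystream usage and the same 2^36 - 32 byte message limit applies.
 */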
1027 static int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1028 const unsigned char *in, unsigned char *out,
1029 size_t len)
1030 {
1031 const union { long one; char little; } is_endian = {1};
1032 unsigned int n, ctr;
1033 size_t i;
1034 u64 mlen = ctx->len.u[1];
1035 block128_f block = ctx->block;
1036 void *key = ctx->key;
1037 #ifdef GCM_FUNCREF_4BIT
1038 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1039 # ifdef GHASH
1040 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1041 const u8 *inp,size_t len) = ctx->ghash;
1042 # endif
1043 #endif
1044
1045 mlen += len;
1046 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1047 return -1;
1048 ctx->len.u[1] = mlen;
1049
1050 if (ctx->ares) {
1051 /* First call to decrypt finalizes GHASH(AAD) */
1052 GCM_MUL(ctx,Xi);
1053 ctx->ares = 0;
1054 }
1055
1056 if (is_endian.little)
1057 ctr = GETU32(ctx->Yi.c+12);
1058 else
1059 ctr = ctx->Yi.d[3];
1060
1061 n = ctx->mres;
1062 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1063 if (16%sizeof(size_t) == 0) do { /* always true actually */
1064 if (n) {
1065 while (n && len) {
1066 u8 c = *(in++);
1067 *(out++) = c^ctx->EKi.c[n];
1068 ctx->Xi.c[n] ^= c;
1069 --len;
1070 n = (n+1)%16;
1071 }
1072 if (n==0) GCM_MUL (ctx,Xi);
1073 else {
1074 ctx->mres = n;
1075 return 0;
1076 }
1077 }
1078 #if defined(STRICT_ALIGNMENT)
1079 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1080 break;
1081 #endif
1082 #if defined(GHASH) && defined(GHASH_CHUNK)
1083 while (len>=GHASH_CHUNK) {
1084 size_t j=GHASH_CHUNK;
1085
1086 GHASH(ctx,in,GHASH_CHUNK);
1087 while (j) {
1088 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1089 ++ctr;
1090 if (is_endian.little){
1091 PUTU32(ctx->Yi.c+12,ctr);
1092 }
1093 else
1094 ctx->Yi.d[3] = ctr;
1095 for (i=0; i<16; i+=sizeof(size_t))
1096 *(size_t *)(out+i) =
1097 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1098 out += 16;
1099 in += 16;
1100 j -= 16;
1101 }
1102 len -= GHASH_CHUNK;
1103 }
1104 if ((i = (len&(size_t)-16))) {
1105 GHASH(ctx,in,i);
1106 while (len>=16) {
1107 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1108 ++ctr;
1109 if (is_endian.little){
1110 PUTU32(ctx->Yi.c+12,ctr);
1111 }
1112 else
1113 ctx->Yi.d[3] = ctr;
1114 for (i=0; i<16; i+=sizeof(size_t))
1115 *(size_t *)(out+i) =
1116 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1117 out += 16;
1118 in += 16;
1119 len -= 16;
1120 }
1121 }
1122 #else
1123 while (len>=16) {
1124 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1125 ++ctr;
1126 if (is_endian.little){
1127 PUTU32(ctx->Yi.c+12,ctr);
1128 }
1129 else
1130 ctx->Yi.d[3] = ctr;
1131 for (i=0; i<16; i+=sizeof(size_t)) {
1132 size_t c = *(size_t *)(in+i);
1133 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1134 *(size_t *)(ctx->Xi.c+i) ^= c;
1135 }
1136 GCM_MUL(ctx,Xi);
1137 out += 16;
1138 in += 16;
1139 len -= 16;
1140 }
1141 #endif
1142 if (len) {
1143 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1144 ++ctr;
1145 if (is_endian.little){
1146 PUTU32(ctx->Yi.c+12,ctr);
1147 }
1148 else
1149 ctx->Yi.d[3] = ctr;
1150 while (len--) {
1151 u8 c = in[n];
1152 ctx->Xi.c[n] ^= c;
1153 out[n] = c^ctx->EKi.c[n];
1154 ++n;
1155 }
1156 }
1157
1158 ctx->mres = n;
1159 return 0;
1160 } while(0);
1161 #endif
1162 for (i=0;i<len;++i) {
1163 u8 c;
1164 if (n==0) {
1165 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1166 ++ctr;
1167 if (is_endian.little){
1168 PUTU32(ctx->Yi.c+12,ctr);
1169 }
1170 else
1171 ctx->Yi.d[3] = ctr;
1172 }
1173 c = in[i];
1174 out[i] = c^ctx->EKi.c[n];
1175 ctx->Xi.c[n] ^= c;
1176 n = (n+1)%16;
1177 if (n==0)
1178 GCM_MUL(ctx,Xi);
1179 }
1180
1181 ctx->mres = n;
1182 return 0;
1183 }
1184
1185 #if 0
1186 static int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1187 const unsigned char *in, unsigned char *out,
1188 size_t len, ctr128_f stream)
1189 {
1190 const union { long one; char little; } is_endian = {1};
1191 unsigned int n, ctr;
1192 size_t i;
1193 u64 mlen = ctx->len.u[1];
1194 void *key = ctx->key;
1195 #ifdef GCM_FUNCREF_4BIT
1196 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1197 # ifdef GHASH
1198 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1199 const u8 *inp,size_t len) = ctx->ghash;
1200 # endif
1201 #endif
1202
1203 mlen += len;
1204 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1205 return -1;
1206 ctx->len.u[1] = mlen;
1207
1208 if (ctx->ares) {
1209 /* First call to encrypt finalizes GHASH(AAD) */
1210 GCM_MUL(ctx,Xi);
1211 ctx->ares = 0;
1212 }
1213
1214 if (is_endian.little)
1215 ctr = GETU32(ctx->Yi.c+12);
1216 else
1217 ctr = ctx->Yi.d[3];
1218
1219 n = ctx->mres;
1220 if (n) {
1221 while (n && len) {
1222 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1223 --len;
1224 n = (n+1)%16;
1225 }
1226 if (n==0) GCM_MUL(ctx,Xi);
1227 else {
1228 ctx->mres = n;
1229 return 0;
1230 }
1231 }
1232 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1233 while (len>=GHASH_CHUNK) {
1234 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1235 ctr += GHASH_CHUNK/16;
1236 if (is_endian.little){
1237 PUTU32(ctx->Yi.c+12,ctr);
1238 }
1239 else
1240 ctx->Yi.d[3] = ctr;
1241 GHASH(ctx,out,GHASH_CHUNK);
1242 out += GHASH_CHUNK;
1243 in += GHASH_CHUNK;
1244 len -= GHASH_CHUNK;
1245 }
1246 #endif
1247 i = (len&(size_t)-16);
1248 if (i) {
1249 size_t j=i/16;
1250
1251 (*stream)(in,out,j,key,ctx->Yi.c);
1252 ctr += (unsigned int)j;
1253 if (is_endian.little){
1254 PUTU32(ctx->Yi.c+12,ctr);
1255 }
1256 else
1257 ctx->Yi.d[3] = ctr;
1258 in += i;
1259 len -= i;
1260 #if defined(GHASH)
1261 GHASH(ctx,out,i);
1262 out += i;
1263 #else
1264 while (j--) {
1265 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1266 GCM_MUL(ctx,Xi);
1267 out += 16;
1268 }
1269 #endif
1270 }
1271 if (len) {
1272 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1273 ++ctr;
1274 if (is_endian.little){
1275 PUTU32(ctx->Yi.c+12,ctr);
1276 }
1277 else
1278 ctx->Yi.d[3] = ctr;
1279 while (len--) {
1280 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1281 ++n;
1282 }
1283 }
1284
1285 ctx->mres = n;
1286 return 0;
1287 }
1288
1289 static int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1290 const unsigned char *in, unsigned char *out,
1291 size_t len,ctr128_f stream)
1292 {
1293 const union { long one; char little; } is_endian = {1};
1294 unsigned int n, ctr;
1295 size_t i;
1296 u64 mlen = ctx->len.u[1];
1297 void *key = ctx->key;
1298 #ifdef GCM_FUNCREF_4BIT
1299 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1300 # ifdef GHASH
1301 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1302 const u8 *inp,size_t len) = ctx->ghash;
1303 # endif
1304 #endif
1305
1306 mlen += len;
1307 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1308 return -1;
1309 ctx->len.u[1] = mlen;
1310
1311 if (ctx->ares) {
1312 /* First call to decrypt finalizes GHASH(AAD) */
1313 GCM_MUL(ctx,Xi);
1314 ctx->ares = 0;
1315 }
1316
1317 if (is_endian.little)
1318 ctr = GETU32(ctx->Yi.c+12);
1319 else
1320 ctr = ctx->Yi.d[3];
1321
1322 n = ctx->mres;
1323 if (n) {
1324 while (n && len) {
1325 u8 c = *(in++);
1326 *(out++) = c^ctx->EKi.c[n];
1327 ctx->Xi.c[n] ^= c;
1328 --len;
1329 n = (n+1)%16;
1330 }
1331 if (n==0) GCM_MUL (ctx,Xi);
1332 else {
1333 ctx->mres = n;
1334 return 0;
1335 }
1336 }
1337 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1338 while (len>=GHASH_CHUNK) {
1339 GHASH(ctx,in,GHASH_CHUNK);
1340 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1341 ctr += GHASH_CHUNK/16;
1342 if (is_endian.little){
1343 PUTU32(ctx->Yi.c+12,ctr);
1344 }
1345 else
1346 ctx->Yi.d[3] = ctr;
1347 out += GHASH_CHUNK;
1348 in += GHASH_CHUNK;
1349 len -= GHASH_CHUNK;
1350 }
1351 #endif
1352 i = (len&(size_t)-16);
1353 if (i) {
1354 size_t j=i/16;
1355
1356 #if defined(GHASH)
1357 GHASH(ctx,in,i);
1358 #else
1359 while (j--) {
1360 size_t k;
1361 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1362 GCM_MUL(ctx,Xi);
1363 in += 16;
1364 }
1365 j = i/16;
1366 in -= i;
1367 #endif
1368 (*stream)(in,out,j,key,ctx->Yi.c);
1369 ctr += (unsigned int)j;
1370 if (is_endian.little){
1371 PUTU32(ctx->Yi.c+12,ctr);
1372 }
1373 else
1374 ctx->Yi.d[3] = ctr;
1375 out += i;
1376 in += i;
1377 len -= i;
1378 }
1379 if (len) {
1380 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1381 ++ctr;
1382 if (is_endian.little){
1383 PUTU32(ctx->Yi.c+12,ctr);
1384 }
1385 else
1386 ctx->Yi.d[3] = ctr;
1387 while (len--) {
1388 u8 c = in[n];
1389 ctx->Xi.c[n] ^= c;
1390 out[n] = c^ctx->EKi.c[n];
1391 ++n;
1392 }
1393 }
1394
1395 ctx->mres = n;
1396 return 0;
1397 }
1398 #endif
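/*
 * Completes the tag: the 64-bit AAD and ciphertext bit lengths are
 * folded into Xi as the final GHASH block and the result is XORed with
 * EK0 = E_K(Y0). If a tag is supplied it is compared with memcmp() and
 * 0 is returned on a match; CRYPTO_gcm128_tag() below instead copies
 * out up to 16 tag bytes.
 */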
1399 static int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1400 size_t len)
1401 {
1402 const union { long one; char little; } is_endian = {1};
1403 u64 alen = ctx->len.u[0]<<3;
1404 u64 clen = ctx->len.u[1]<<3;
1405 #ifdef GCM_FUNCREF_4BIT
1406 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1407 #endif
1408
1409 if (ctx->mres || ctx->ares)
1410 GCM_MUL(ctx,Xi);
1411
1412 if (is_endian.little) {
1413 #ifdef BSWAP8
1414 alen = BSWAP8(alen);
1415 clen = BSWAP8(clen);
1416 #else
1417 u8 *p = ctx->len.c;
1418
1419 ctx->len.u[0] = alen;
1420 ctx->len.u[1] = clen;
1421
1422 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1423 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1424 #endif
1425 }
1426
1427 ctx->Xi.u[0] ^= alen;
1428 ctx->Xi.u[1] ^= clen;
1429 GCM_MUL(ctx,Xi);
1430
1431 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1432 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1433
1434 if (tag && len<=sizeof(ctx->Xi))
1435 return memcmp(ctx->Xi.c,tag,len);
1436 else
1437 return -1;
1438 }
1439
1440 static void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1441 {
1442 CRYPTO_gcm128_finish(ctx, NULL, 0);
1443 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1444 }
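/*
 * Constant-time tag comparison: differences are accumulated over the
 * whole length rather than returning at the first mismatch, so the
 * comparison time does not reveal how many leading tag bytes matched.
 */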
static int compare_string(unsigned char *a, unsigned char *b, unsigned int len)
{
	unsigned int i;
	unsigned char diff = 0;

	if ((a == NULL) || (b == NULL))
		return -1;

	for (i = 0; i < len; i++)
		diff |= a[i] ^ b[i];

	return diff ? -1 : 0;
}
1460 int rk_aes_gcm_encrypt(struct aes_ae_in *in, struct aes_ae_out *out, const int enc)
1461 {
1462 GCM128_CONTEXT ctx;
1463 unsigned char tmp[16];
1464 RK_AES_KEY ks;
1465 if(in == NULL || out== NULL)
1466 return -1;
1467
1468 if (in->key_len != 16 && in->key_len != 24 && in->key_len != 32)
1469 return -1;
1470
1471 if (in->tag_size > 16)
1472 return -1;
1473
1474 rk_aes_set_encrypt_key(in->key,in->key_len*8,&ks);
1475 CRYPTO_gcm128_init(&ctx,&ks,(block128_f)rk_aes_encrypt);
1476 CRYPTO_gcm128_setiv(&ctx,in->iv,in->iv_len);
1477 if (in->aad_len) CRYPTO_gcm128_aad(&ctx,in->aad,in->aad_len);
1478 if(enc){
1479 if (in->src_len) CRYPTO_gcm128_encrypt(&ctx,in->src,out->dest,in->src_len);
1480 CRYPTO_gcm128_tag(&ctx, out->tag, in->tag_size);
1481 return 0;
1482 }else{
1483 if (in->src_len) CRYPTO_gcm128_decrypt(&ctx,in->src,out->dest,in->src_len);
1484 CRYPTO_gcm128_tag(&ctx, tmp, in->tag_size);
1485 return compare_string(tmp, out->tag, in->tag_size);
1486 }
1487 }
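/*
 * Minimal usage sketch (illustrative only; it assumes the aes_ae_in and
 * aes_ae_out definitions in aes_core.h expose exactly the pointer and
 * length fields referenced above, and that the caller owns all
 * buffers):
 *
 *	struct aes_ae_in in;
 *	struct aes_ae_out out;
 *	unsigned char ct[64], tag[16];
 *
 *	memset(&in, 0, sizeof(in));
 *	memset(&out, 0, sizeof(out));
 *	in.key = key;       in.key_len = 16;       // AES-128
 *	in.iv = iv;         in.iv_len = 12;        // 96-bit IV
 *	in.aad = aad;       in.aad_len = aad_len;
 *	in.src = pt;        in.src_len = sizeof(ct);
 *	in.tag_size = 16;
 *	out.dest = ct;      out.tag = tag;
 *
 *	if (rk_aes_gcm_encrypt(&in, &out, 1) != 0)
 *		goto err;                          // handle failure
 *
 * For decryption pass enc == 0, point in.src at the ciphertext and
 * out.tag at the received tag; the return value is 0 only if the
 * recomputed tag matches.
 */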
1488
1489
1490