xref: /OK3568_Linux_fs/external/security/librkcrypto/test/c_mode/aes_gcm.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <assert.h>
5 #include "aes_core.h"
6 #include "aes_locl.h"
7 
8 #define OPENSSL_FIPSAPI
9 #define TABLE_BITS 1
10 #include <string.h>
11 #define DEBUG(format,...) printf("[%s]:%d: "format"\n", __func__,__LINE__, ##__VA_ARGS__)
12 
13 #ifndef MODES_DEBUG
14 # ifndef NDEBUG
15 #  define NDEBUG
16 # endif
17 #endif
18 #include <assert.h>
19 
20 #if defined(BSWAP4)
21 /* redefine, because alignment is ensured */
22 #undef	GETU32
23 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
24 #undef	PUTU32
25 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
26 #endif
27 
28 #define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
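/*
 * REDUCE1BIT shifts V right by one bit and, when the bit shifted out was
 * set, XORs in the constant E1000...0 — i.e. multiplication by x in
 * GF(2^128) using GCM's bit-reflected representation, reduced modulo the
 * GHASH polynomial x^128 + x^7 + x^2 + x + 1.
 */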
29 #define REDUCE1BIT(V)	do { \
30 	if (sizeof(size_t)==8) { \
31 		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
32 		V.lo  = (V.hi<<63)|(V.lo>>1); \
33 		V.hi  = (V.hi>>1 )^T; \
34 	} \
35 	else { \
36 		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
37 		V.lo  = (V.hi<<63)|(V.lo>>1); \
38 		V.hi  = (V.hi>>1 )^((u64)T<<32); \
39 	} \
40 } while(0)
41 
42 typedef struct { u64 hi,lo; } u128;
43 
44 typedef void (*block128_f)(const unsigned char in[16],
45 			unsigned char out[16],
46 			const void *key);
47 
48 typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
49 			unsigned int blocks, const void *key,
50 			const unsigned char ivec[16]);
51 
52 
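/*
 * State layout (descriptive summary of the fields below):
 *   Yi  - current counter block Y_i
 *   EKi - E(K, Y_i), the keystream block for the current counter
 *   EK0 - E(K, Y_0), XORed into the final tag
 *   len - len.u[0] holds the AAD length, len.u[1] the message length
 *   Xi  - running GHASH value
 *   H   - hash subkey E(K, 0^128); Htable caches its precomputed multiples
 *   mres/ares - residual message/AAD byte counts carried across calls
 */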
53 struct gcm128_context {
54 	/* The following 6 names follow the naming in the GCM specification */
55 	union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len,
56 						Xi,H;
57 	/* Relative position of Xi, H and pre-computed Htable is used
58 	 * in some assembler modules, i.e. don't change the order! */
59 #if TABLE_BITS==8
60 	u128 Htable[256];
61 #else
62 	u128 Htable[16];
63 	void (*gmult)(u64 Xi[2],const u128 Htable[16]);
64 	void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
65 #endif
66 	unsigned int mres, ares;
67 	block128_f block;
68 	void *key;
69 };
70 
71 typedef struct gcm128_context GCM128_CONTEXT;
72 
73 /*
74  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
75  * never be set to 8. 8 is effectively reserved for testing purposes.
76  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
77  * "Shoup's" in the GCM specification. In other words OpenSSL does not
78  * cover the whole spectrum of possible table-driven implementations.
79  * Why? In the non-"Shoup's" case the memory access pattern is segmented
80  * in such a manner that it's trivial to see that cache-timing
81  * information can reveal a fair portion of the intermediate hash value.
82  * Given that the ciphertext is always available to an attacker, it is
83  * possible to attempt to deduce the secret parameter H and, if
84  * successful, tamper with messages [which is trivial in CTR mode]. In
85  * "Shoup's" case it's not as easy, but there is no reason to believe
86  * it's resistant to cache-timing attacks either. The "8-bit"
87  * implementation also consumes 16 (sixteen) times more memory, 4KB per
88  * individual key + 1KB shared. On the plus side it should be twice as
89  * fast as the "4-bit" version, and for gcc-generated x86[_64] code the
90  * "8-bit" version was observed to run ~75% faster, closer to 100% for
91  * commercial compilers... Yet the "4-bit" procedure is preferred, because it's
92  * believed to provide better security-performance balance and adequate
93  * all-round performance. "All-round" refers to things like:
94  *
95  * - shorter setup time effectively improves overall timing for
96  *   handling short messages;
97  * - larger table allocation can become unbearable because of VM
98  *   subsystem penalties (for example on Windows a large enough free()
99  *   results in VM working-set trimming, meaning that a subsequent
100  *   malloc() would immediately incur working-set expansion);
101  * - larger table has larger cache footprint, which can affect
102  *   performance of other code paths (not necessarily even from same
103  *   thread in Hyper-Threading world);
104  *
105  * Value of 1 is not appropriate for performance reasons.
106  */
107 #if	TABLE_BITS==8
108 
109 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
110 {
111 	int  i, j;
112 	u128 V;
113 
114 	Htable[0].hi = 0;
115 	Htable[0].lo = 0;
116 	V.hi = H[0];
117 	V.lo = H[1];
118 
119 	for (Htable[128]=V, i=64; i>0; i>>=1) {
120 		REDUCE1BIT(V);
121 		Htable[i] = V;
122 	}
123 
124 	for (i=2; i<256; i<<=1) {
125 		u128 *Hi = Htable+i, H0 = *Hi;
126 		for (j=1; j<i; ++j) {
127 			Hi[j].hi = H0.hi^Htable[j].hi;
128 			Hi[j].lo = H0.lo^Htable[j].lo;
129 		}
130 	}
131 }
132 
133 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
134 {
135 	u128 Z = { 0, 0};
136 	const u8 *xi = (const u8 *)Xi+15;
137 	size_t rem, n = *xi;
138 	const union { long one; char little; } is_endian = {1};
139 	static const size_t rem_8bit[256] = {
140 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
141 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
142 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
143 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
144 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
145 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
146 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
147 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
148 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
149 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
150 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
151 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
152 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
153 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
154 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
155 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
156 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
157 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
158 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
159 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
160 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
161 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
162 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
163 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
164 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
165 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
166 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
167 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
168 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
169 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
170 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
171 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
172 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
173 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
174 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
175 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
176 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
177 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
178 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
179 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
180 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
181 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
182 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
183 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
184 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
185 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
186 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
187 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
188 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
189 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
190 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
191 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
192 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
193 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
194 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
195 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
196 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
197 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
198 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
199 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
200 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
201 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
202 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
203 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
204 
205 	while (1) {
206 		Z.hi ^= Htable[n].hi;
207 		Z.lo ^= Htable[n].lo;
208 
209 		if ((u8 *)Xi==xi)	break;
210 
211 		n = *(--xi);
212 
213 		rem  = (size_t)Z.lo&0xff;
214 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
215 		Z.hi = (Z.hi>>8);
216 		if (sizeof(size_t)==8)
217 			Z.hi ^= rem_8bit[rem];
218 		else
219 			Z.hi ^= (u64)rem_8bit[rem]<<32;
220 	}
221 
222 	if (is_endian.little) {
223 #ifdef BSWAP8
224 		Xi[0] = BSWAP8(Z.hi);
225 		Xi[1] = BSWAP8(Z.lo);
226 #else
227 		u8 *p = (u8 *)Xi;
228 		u32 v;
229 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
230 		v = (u32)(Z.hi);	PUTU32(p+4,v);
231 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
232 		v = (u32)(Z.lo);	PUTU32(p+12,v);
233 #endif
234 	}
235 	else {
236 		Xi[0] = Z.hi;
237 		Xi[1] = Z.lo;
238 	}
239 }
240 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
241 
242 #elif	TABLE_BITS==4
243 
244 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
245 {
246 	u128 V;
247 #if defined(OPENSSL_SMALL_FOOTPRINT)
248 	int  i;
249 #endif
250 
251 	Htable[0].hi = 0;
252 	Htable[0].lo = 0;
253 	V.hi = H[0];
254 	V.lo = H[1];
255 
256 #if defined(OPENSSL_SMALL_FOOTPRINT)
257 	for (Htable[8]=V, i=4; i>0; i>>=1) {
258 		REDUCE1BIT(V);
259 		Htable[i] = V;
260 	}
261 
262 	for (i=2; i<16; i<<=1) {
263 		u128 *Hi = Htable+i;
264 		int   j;
265 		for (V=*Hi, j=1; j<i; ++j) {
266 			Hi[j].hi = V.hi^Htable[j].hi;
267 			Hi[j].lo = V.lo^Htable[j].lo;
268 		}
269 	}
270 #else
271 	Htable[8] = V;
272 	REDUCE1BIT(V);
273 	Htable[4] = V;
274 	REDUCE1BIT(V);
275 	Htable[2] = V;
276 	REDUCE1BIT(V);
277 	Htable[1] = V;
278 	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
279 	V=Htable[4];
280 	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
281 	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
282 	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
283 	V=Htable[8];
284 	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
285 	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
286 	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
287 	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
288 	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
289 	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
290 	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
291 #endif
292 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
293 	/*
294 	 * ARM assembler expects specific dword order in Htable.
295 	 */
296 	{
297 	int j;
298 	const union { long one; char little; } is_endian = {1};
299 
300 	if (is_endian.little)
301 		for (j=0;j<16;++j) {
302 			V = Htable[j];
303 			Htable[j].hi = V.lo;
304 			Htable[j].lo = V.hi;
305 		}
306 	else
307 		for (j=0;j<16;++j) {
308 			V = Htable[j];
309 			Htable[j].hi = V.lo<<32|V.lo>>32;
310 			Htable[j].lo = V.hi<<32|V.hi>>32;
311 		}
312 	}
313 #endif
314 }
315 
316 #ifndef GHASH_ASM
317 static const size_t rem_4bit[16] = {
318 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
319 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
320 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
321 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
322 
323 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
324 {
325 	u128 Z;
326 	int cnt = 15;
327 	size_t rem, nlo, nhi;
328 	const union { long one; char little; } is_endian = {1};
329 
330 	nlo  = ((const u8 *)Xi)[15];
331 	nhi  = nlo>>4;
332 	nlo &= 0xf;
333 
334 	Z.hi = Htable[nlo].hi;
335 	Z.lo = Htable[nlo].lo;
336 
337 	while (1) {
338 		rem  = (size_t)Z.lo&0xf;
339 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
340 		Z.hi = (Z.hi>>4);
341 		if (sizeof(size_t)==8)
342 			Z.hi ^= rem_4bit[rem];
343 		else
344 			Z.hi ^= (u64)rem_4bit[rem]<<32;
345 
346 		Z.hi ^= Htable[nhi].hi;
347 		Z.lo ^= Htable[nhi].lo;
348 
349 		if (--cnt<0)		break;
350 
351 		nlo  = ((const u8 *)Xi)[cnt];
352 		nhi  = nlo>>4;
353 		nlo &= 0xf;
354 
355 		rem  = (size_t)Z.lo&0xf;
356 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
357 		Z.hi = (Z.hi>>4);
358 		if (sizeof(size_t)==8)
359 			Z.hi ^= rem_4bit[rem];
360 		else
361 			Z.hi ^= (u64)rem_4bit[rem]<<32;
362 
363 		Z.hi ^= Htable[nlo].hi;
364 		Z.lo ^= Htable[nlo].lo;
365 	}
366 
367 	if (is_endian.little) {
368 #ifdef BSWAP8
369 		Xi[0] = BSWAP8(Z.hi);
370 		Xi[1] = BSWAP8(Z.lo);
371 #else
372 		u8 *p = (u8 *)Xi;
373 		u32 v;
374 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
375 		v = (u32)(Z.hi);	PUTU32(p+4,v);
376 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
377 		v = (u32)(Z.lo);	PUTU32(p+12,v);
378 #endif
379 	}
380 	else {
381 		Xi[0] = Z.hi;
382 		Xi[1] = Z.lo;
383 	}
384 }
385 
386 #if !defined(OPENSSL_SMALL_FOOTPRINT)
387 /*
388  * Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt for
389  * details... Compiler-generated code doesn't seem to give any
390  * performance improvement, at least not on x86[_64]. It's here
391  * mostly as reference and a placeholder for possible future
392  * non-trivial optimization[s]...
393  */
394 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
395 				const u8 *inp,size_t len)
396 {
397     u128 Z;
398     int cnt;
399     size_t rem, nlo, nhi;
400     const union { long one; char little; } is_endian = {1};
401 
402 #if 1
403     do {
404 	cnt  = 15;
405 	nlo  = ((const u8 *)Xi)[15];
406 	nlo ^= inp[15];
407 	nhi  = nlo>>4;
408 	nlo &= 0xf;
409 
410 	Z.hi = Htable[nlo].hi;
411 	Z.lo = Htable[nlo].lo;
412 
413 	while (1) {
414 		rem  = (size_t)Z.lo&0xf;
415 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
416 		Z.hi = (Z.hi>>4);
417 		if (sizeof(size_t)==8)
418 			Z.hi ^= rem_4bit[rem];
419 		else
420 			Z.hi ^= (u64)rem_4bit[rem]<<32;
421 
422 		Z.hi ^= Htable[nhi].hi;
423 		Z.lo ^= Htable[nhi].lo;
424 
425 		if (--cnt<0)		break;
426 
427 		nlo  = ((const u8 *)Xi)[cnt];
428 		nlo ^= inp[cnt];
429 		nhi  = nlo>>4;
430 		nlo &= 0xf;
431 
432 		rem  = (size_t)Z.lo&0xf;
433 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
434 		Z.hi = (Z.hi>>4);
435 		if (sizeof(size_t)==8)
436 			Z.hi ^= rem_4bit[rem];
437 		else
438 			Z.hi ^= (u64)rem_4bit[rem]<<32;
439 
440 		Z.hi ^= Htable[nlo].hi;
441 		Z.lo ^= Htable[nlo].lo;
442 	}
443 #else
444     /*
445      * Extra 256+16 bytes per-key plus 512 bytes shared tables
446      * [should] give ~50% improvement... One could have PACK()-ed
447      * the rem_8bit even here, but the priority is to minimize
448      * cache footprint...
449      */
450     u128 Hshr4[16];	/* Htable shifted right by 4 bits */
451     u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
452     static const unsigned short rem_8bit[256] = {
453 	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
454 	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
455 	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
456 	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
457 	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
458 	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
459 	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
460 	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
461 	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
462 	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
463 	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
464 	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
465 	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
466 	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
467 	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
468 	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
469 	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
470 	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
471 	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
472 	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
473 	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
474 	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
475 	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
476 	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
477 	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
478 	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
479 	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
480 	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
481 	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
482 	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
483 	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
484 	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
485     /*
486      * This pre-processing phase slows down procedure by approximately
487      * same time as it makes each loop spin faster. In other words
488      * single block performance is approximately same as straightforward
489      * "4-bit" implementation, and then it goes only faster...
490      */
491     for (cnt=0; cnt<16; ++cnt) {
492 	Z.hi = Htable[cnt].hi;
493 	Z.lo = Htable[cnt].lo;
494 	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
495 	Hshr4[cnt].hi = (Z.hi>>4);
496 	Hshl4[cnt]    = (u8)(Z.lo<<4);
497     }
498 
499     do {
500 	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
501 		nlo  = ((const u8 *)Xi)[cnt];
502 		nlo ^= inp[cnt];
503 		nhi  = nlo>>4;
504 		nlo &= 0xf;
505 
506 		Z.hi ^= Htable[nlo].hi;
507 		Z.lo ^= Htable[nlo].lo;
508 
509 		rem = (size_t)Z.lo&0xff;
510 
511 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
512 		Z.hi = (Z.hi>>8);
513 
514 		Z.hi ^= Hshr4[nhi].hi;
515 		Z.lo ^= Hshr4[nhi].lo;
516 		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
517 	}
518 
519 	nlo  = ((const u8 *)Xi)[0];
520 	nlo ^= inp[0];
521 	nhi  = nlo>>4;
522 	nlo &= 0xf;
523 
524 	Z.hi ^= Htable[nlo].hi;
525 	Z.lo ^= Htable[nlo].lo;
526 
527 	rem = (size_t)Z.lo&0xf;
528 
529 	Z.lo = (Z.hi<<60)|(Z.lo>>4);
530 	Z.hi = (Z.hi>>4);
531 
532 	Z.hi ^= Htable[nhi].hi;
533 	Z.lo ^= Htable[nhi].lo;
534 	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
535 #endif
536 
537 	if (is_endian.little) {
538 #ifdef BSWAP8
539 		Xi[0] = BSWAP8(Z.hi);
540 		Xi[1] = BSWAP8(Z.lo);
541 #else
542 		u8 *p = (u8 *)Xi;
543 		u32 v;
544 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
545 		v = (u32)(Z.hi);	PUTU32(p+4,v);
546 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
547 		v = (u32)(Z.lo);	PUTU32(p+12,v);
548 #endif
549 	}
550 	else {
551 		Xi[0] = Z.hi;
552 		Xi[1] = Z.lo;
553 	}
554     } while (inp+=16, len-=16);
555 }
556 #endif
557 #else
558 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
559 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
560 #endif
561 
562 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
563 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
564 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
565 /* GHASH_CHUNK is a "stride parameter" intended to mitigate the cache-
566  * thrashing effect. In other words the idea is to hash data while it's
567  * still in the L1 cache after the encryption pass... */
568 #define GHASH_CHUNK       (3*1024)
569 #endif
570 
571 #else	/* TABLE_BITS */
572 
573 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
574 {
575 	u128 V,Z = { 0,0 };
576 	long X;
577 	unsigned int  i,j;
578 	const long *xi = (const long *)Xi;
579 	const union { long one; char little; } is_endian = {1};
580 
581 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
582 	V.lo = H[1];
583 
584 	for (j=0; j<16/sizeof(long); ++j) {
585 		if (is_endian.little) {
586 			if (sizeof(long)==8) {
587 #ifdef BSWAP8
588 				X = (long)(BSWAP8(xi[j]));
589 #else
590 				const u8 *p = (const u8 *)(xi+j);
591 				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
592 #endif
593 			}
594 			else {
595 				const u8 *p = (const u8 *)(xi+j);
596 				X = (long)GETU32(p);
597 			}
598 		}
599 		else
600 			X = xi[j];
601 
602 		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
603 			u64 M = (u64)(X>>(8*sizeof(long)-1));
604 			Z.hi ^= V.hi&M;
605 			Z.lo ^= V.lo&M;
606 
607 			REDUCE1BIT(V);
608 		}
609 	}
610 
611 	if (is_endian.little) {
612 #ifdef BSWAP8
613 		Xi[0] = BSWAP8(Z.hi);
614 		Xi[1] = BSWAP8(Z.lo);
615 #else
616 		u8 *p = (u8 *)Xi;
617 		u32 v;
618 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
619 		v = (u32)(Z.hi);	PUTU32(p+4,v);
620 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
621 		v = (u32)(Z.lo);	PUTU32(p+12,v);
622 #endif
623 	}
624 	else {
625 		Xi[0] = Z.hi;
626 		Xi[1] = Z.lo;
627 	}
628 }
629 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
630 
631 #endif
632 
633 #if	TABLE_BITS==4 && defined(GHASH_ASM)
634 # if	!defined(I386_ONLY) && \
635 	(defined(__i386)	|| defined(__i386__)	|| \
636 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
637 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
638 #  define GHASH_ASM_X86_OR_64
639 #  define GCM_FUNCREF_4BIT
640 extern unsigned int OPENSSL_ia32cap_P[2];
641 
642 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
643 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
644 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
645 
646 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
647 #   define GHASH_ASM_X86
648 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
649 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
650 
651 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
652 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
653 #  endif
654 # elif defined(__arm__) || defined(__arm)
655 #  include "arm_arch.h"
656 #  if __ARM_ARCH__>=7
657 #   define GHASH_ASM_ARM
658 #   define GCM_FUNCREF_4BIT
659 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
660 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 #  endif
662 # endif
663 #endif
664 
665 #ifdef GCM_FUNCREF_4BIT
666 # undef  GCM_MUL
667 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
668 # ifdef GHASH
669 #  undef  GHASH
670 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
671 # endif
672 #endif
673 
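/*
 * Derives the hash subkey H = E(K, 0^128) with the supplied block cipher,
 * converts it to host byte order and precomputes Htable (and, where
 * assembler support is compiled in, selects the gmult/ghash routines).
 */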
674 static void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
675 {
676 	const union { long one; char little; } is_endian = {1};
677 
678 	memset(ctx,0,sizeof(*ctx));
679 	ctx->block = block;
680 	ctx->key   = key;
681 
682 	(*block)(ctx->H.c, ctx->H.c,key);
683 
684 	if (is_endian.little) {
685 		/* H is stored in host byte order */
686 #ifdef BSWAP8
687 		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
688 		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
689 #else
690 		u8 *p = ctx->H.c;
691 		u64 hi,lo;
692 		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
693 		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
694 		ctx->H.u[0] = hi;
695 		ctx->H.u[1] = lo;
696 #endif
697 	}
698 
699 #if	TABLE_BITS==8
700 	gcm_init_8bit(ctx->Htable,ctx->H.u);
701 #elif	TABLE_BITS==4
702 # if	defined(GHASH_ASM_X86_OR_64)
703 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
704 	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
705 	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
706 		gcm_init_clmul(ctx->Htable,ctx->H.u);
707 		ctx->gmult = gcm_gmult_clmul;
708 		ctx->ghash = gcm_ghash_clmul;
709 		return;
710 	}
711 #  endif
712 	gcm_init_4bit(ctx->Htable,ctx->H.u);
713 #  if	defined(GHASH_ASM_X86)			/* x86 only */
714 #   if	defined(OPENSSL_IA32_SSE2)
715 	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
716 #   else
717 	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
718 #   endif
719 		ctx->gmult = gcm_gmult_4bit_mmx;
720 		ctx->ghash = gcm_ghash_4bit_mmx;
721 	} else {
722 		ctx->gmult = gcm_gmult_4bit_x86;
723 		ctx->ghash = gcm_ghash_4bit_x86;
724 	}
725 #  else
726 	ctx->gmult = gcm_gmult_4bit;
727 	ctx->ghash = gcm_ghash_4bit;
728 #  endif
729 # elif	defined(GHASH_ASM_ARM)
730 	if (OPENSSL_armcap_P & ARMV7_NEON) {
731 		ctx->gmult = gcm_gmult_neon;
732 		ctx->ghash = gcm_ghash_neon;
733 	} else {
734 		gcm_init_4bit(ctx->Htable,ctx->H.u);
735 		ctx->gmult = gcm_gmult_4bit;
736 		ctx->ghash = gcm_ghash_4bit;
737 	}
738 # else
739 	gcm_init_4bit(ctx->Htable,ctx->H.u);
740 # endif
741 #endif
742 }
743 
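/*
 * Resets the per-message state and derives the initial counter block Y0:
 * a 12-byte IV is used verbatim with the 32-bit counter set to 1, while
 * any other length is GHASHed together with its bit length. Also computes
 * EK0 = E(K, Y0) for the tag and advances the counter to Y1.
 */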
744 static void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
745 {
746 	const union { long one; char little; } is_endian = {1};
747 	unsigned int ctr;
748 #ifdef GCM_FUNCREF_4BIT
749 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
750 #endif
751 
752 	ctx->Yi.u[0]  = 0;
753 	ctx->Yi.u[1]  = 0;
754 	ctx->Xi.u[0]  = 0;
755 	ctx->Xi.u[1]  = 0;
756 	ctx->len.u[0] = 0;	/* AAD length */
757 	ctx->len.u[1] = 0;	/* message length */
758 	ctx->ares = 0;
759 	ctx->mres = 0;
760 
761 	if (len==12) {
762 		memcpy(ctx->Yi.c,iv,12);
763 		ctx->Yi.c[15]=1;
764 		ctr=1;
765 	}
766 	else {
767 		size_t i;
768 		u64 len0 = len;
769 
770 		while (len>=16) {
771 			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
772 			GCM_MUL(ctx,Yi);
773 			iv += 16;
774 			len -= 16;
775 		}
776 		if (len) {
777 			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
778 			GCM_MUL(ctx,Yi);
779 		}
780 		len0 <<= 3;
781 		if (is_endian.little) {
782 #ifdef BSWAP8
783 			ctx->Yi.u[1]  ^= BSWAP8(len0);
784 #else
785 			ctx->Yi.c[8]  ^= (u8)(len0>>56);
786 			ctx->Yi.c[9]  ^= (u8)(len0>>48);
787 			ctx->Yi.c[10] ^= (u8)(len0>>40);
788 			ctx->Yi.c[11] ^= (u8)(len0>>32);
789 			ctx->Yi.c[12] ^= (u8)(len0>>24);
790 			ctx->Yi.c[13] ^= (u8)(len0>>16);
791 			ctx->Yi.c[14] ^= (u8)(len0>>8);
792 			ctx->Yi.c[15] ^= (u8)(len0);
793 #endif
794 		}
795 		else
796 			ctx->Yi.u[1]  ^= len0;
797 
798 		GCM_MUL(ctx,Yi);
799 
800 		if (is_endian.little)
801 			ctr = GETU32(ctx->Yi.c+12);
802 		else
803 			ctr = ctx->Yi.d[3];
804 	}
805 
806 	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
807 	++ctr;
808 	if (is_endian.little){
809 		PUTU32(ctx->Yi.c+12,ctr);
810 	}
811 	else
812 		ctx->Yi.d[3] = ctr;
813 }
814 
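/*
 * Feeds additional authenticated data into GHASH. Must be called after
 * CRYPTO_gcm128_setiv() and before any encrypt/decrypt call; returns -2
 * if message data has already been processed and -1 on AAD length overflow.
 */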
815 static int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
816 {
817 	size_t i;
818 	unsigned int n;
819 	u64 alen = ctx->len.u[0];
820 #ifdef GCM_FUNCREF_4BIT
821 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
822 # ifdef GHASH
823 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
824 				const u8 *inp,size_t len)	= ctx->ghash;
825 # endif
826 #endif
827 
828 	if (ctx->len.u[1]) return -2;
829 
830 	alen += len;
831 	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
832 		return -1;
833 	ctx->len.u[0] = alen;
834 
835 	n = ctx->ares;
836 	if (n) {
837 		while (n && len) {
838 			ctx->Xi.c[n] ^= *(aad++);
839 			--len;
840 			n = (n+1)%16;
841 		}
842 		if (n==0) GCM_MUL(ctx,Xi);
843 		else {
844 			ctx->ares = n;
845 			return 0;
846 		}
847 	}
848 
849 #ifdef GHASH
850 	if ((i = (len&(size_t)-16))) {
851 		GHASH(ctx,aad,i);
852 		aad += i;
853 		len -= i;
854 	}
855 #else
856 	while (len>=16) {
857 		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
858 		GCM_MUL(ctx,Xi);
859 		aad += 16;
860 		len -= 16;
861 	}
862 #endif
863 	if (len) {
864 		n = (unsigned int)len;
865 		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
866 	}
867 
868 	ctx->ares = n;
869 	return 0;
870 }
871 
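/*
 * CTR-encrypts len bytes from in to out and folds the ciphertext into
 * GHASH. Partial blocks are carried across calls via ctx->mres, and the
 * first call finalizes GHASH over any buffered AAD (ctx->ares).
 */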
872 static int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
873 		const unsigned char *in, unsigned char *out,
874 		size_t len)
875 {
876 	const union { long one; char little; } is_endian = {1};
877 	unsigned int n, ctr;
878 	size_t i;
879 	u64        mlen  = ctx->len.u[1];
880 	block128_f block = ctx->block;
881 	void      *key   = ctx->key;
882 #ifdef GCM_FUNCREF_4BIT
883 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
884 # ifdef GHASH
885 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
886 				const u8 *inp,size_t len)	= ctx->ghash;
887 # endif
888 #endif
889 
890 #if 0
891 	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
892 #endif
893 	mlen += len;
894 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
895 		return -1;
896 	ctx->len.u[1] = mlen;
897 
898 	if (ctx->ares) {
899 		/* First call to encrypt finalizes GHASH(AAD) */
900 		GCM_MUL(ctx,Xi);
901 		ctx->ares = 0;
902 	}
903 
904 	if (is_endian.little)
905 		ctr = GETU32(ctx->Yi.c+12);
906 	else
907 		ctr = ctx->Yi.d[3];
908 
909 	n = ctx->mres;
910 #if !defined(OPENSSL_SMALL_FOOTPRINT)
911 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
912 		if (n) {
913 			while (n && len) {
914 				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
915 				--len;
916 				n = (n+1)%16;
917 			}
918 			if (n==0) GCM_MUL(ctx,Xi);
919 			else {
920 				ctx->mres = n;
921 				return 0;
922 			}
923 		}
924 #if defined(STRICT_ALIGNMENT)
925 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
926 			break;
927 #endif
928 #if defined(GHASH) && defined(GHASH_CHUNK)
929 		while (len>=GHASH_CHUNK) {
930 		    size_t j=GHASH_CHUNK;
931 
932 		    while (j) {
933 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
934 			++ctr;
935 			if (is_endian.little){
936 				PUTU32(ctx->Yi.c+12,ctr);
937 			}
938 			else
939 				ctx->Yi.d[3] = ctr;
940 			for (i=0; i<16; i+=sizeof(size_t))
941 				*(size_t *)(out+i) =
942 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
943 			out += 16;
944 			in  += 16;
945 			j   -= 16;
946 		    }
947 		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
948 		    len -= GHASH_CHUNK;
949 		}
950 		if ((i = (len&(size_t)-16))) {
951 		    size_t j=i;
952 
953 		    while (len>=16) {
954 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
955 			++ctr;
956 			if (is_endian.little){
957 				PUTU32(ctx->Yi.c+12,ctr);
958 			}
959 			else
960 				ctx->Yi.d[3] = ctr;
961 			for (i=0; i<16; i+=sizeof(size_t))
962 				*(size_t *)(out+i) =
963 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
964 			out += 16;
965 			in  += 16;
966 			len -= 16;
967 		    }
968 		    GHASH(ctx,out-j,j);
969 		}
970 #else
971 		while (len>=16) {
972 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
973 			++ctr;
974 			if (is_endian.little){
975 				PUTU32(ctx->Yi.c+12,ctr);
976 			}
977 			else
978 				ctx->Yi.d[3] = ctr;
979 			for (i=0; i<16; i+=sizeof(size_t))
980 				*(size_t *)(ctx->Xi.c+i) ^=
981 				*(size_t *)(out+i) =
982 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
983 			GCM_MUL(ctx,Xi);
984 			out += 16;
985 			in  += 16;
986 			len -= 16;
987 		}
988 #endif
989 		if (len) {
990 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
991 			++ctr;
992 			if (is_endian.little){
993 				PUTU32(ctx->Yi.c+12,ctr);
994 			}
995 			else
996 				ctx->Yi.d[3] = ctr;
997 			while (len--) {
998 				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
999 				++n;
1000 			}
1001 		}
1002 
1003 		ctx->mres = n;
1004 		return 0;
1005 	} while(0);
1006 #endif
1007 	for (i=0;i<len;++i) {
1008 		if (n==0) {
1009 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1010 			++ctr;
1011 			if (is_endian.little){
1012 				PUTU32(ctx->Yi.c+12,ctr);
1013 			}
1014 			else
1015 				ctx->Yi.d[3] = ctr;
1016 		}
1017 		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1018 		n = (n+1)%16;
1019 		if (n==0)
1020 			GCM_MUL(ctx,Xi);
1021 	}
1022 
1023 	ctx->mres = n;
1024 	return 0;
1025 }
1026 
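/*
 * Mirror of CRYPTO_gcm128_encrypt(): folds the ciphertext into GHASH
 * before (or while) CTR-decrypting it, so that the same Xi is produced
 * for tag verification.
 */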
1027 static int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1028 		const unsigned char *in, unsigned char *out,
1029 		size_t len)
1030 {
1031 	const union { long one; char little; } is_endian = {1};
1032 	unsigned int n, ctr;
1033 	size_t i;
1034 	u64        mlen  = ctx->len.u[1];
1035 	block128_f block = ctx->block;
1036 	void      *key   = ctx->key;
1037 #ifdef GCM_FUNCREF_4BIT
1038 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1039 # ifdef GHASH
1040 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1041 				const u8 *inp,size_t len)	= ctx->ghash;
1042 # endif
1043 #endif
1044 
1045 	mlen += len;
1046 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1047 		return -1;
1048 	ctx->len.u[1] = mlen;
1049 
1050 	if (ctx->ares) {
1051 		/* First call to decrypt finalizes GHASH(AAD) */
1052 		GCM_MUL(ctx,Xi);
1053 		ctx->ares = 0;
1054 	}
1055 
1056 	if (is_endian.little)
1057 		ctr = GETU32(ctx->Yi.c+12);
1058 	else
1059 		ctr = ctx->Yi.d[3];
1060 
1061 	n = ctx->mres;
1062 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1063 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1064 		if (n) {
1065 			while (n && len) {
1066 				u8 c = *(in++);
1067 				*(out++) = c^ctx->EKi.c[n];
1068 				ctx->Xi.c[n] ^= c;
1069 				--len;
1070 				n = (n+1)%16;
1071 			}
1072 			if (n==0) GCM_MUL (ctx,Xi);
1073 			else {
1074 				ctx->mres = n;
1075 				return 0;
1076 			}
1077 		}
1078 #if defined(STRICT_ALIGNMENT)
1079 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1080 			break;
1081 #endif
1082 #if defined(GHASH) && defined(GHASH_CHUNK)
1083 		while (len>=GHASH_CHUNK) {
1084 		    size_t j=GHASH_CHUNK;
1085 
1086 		    GHASH(ctx,in,GHASH_CHUNK);
1087 		    while (j) {
1088 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1089 			++ctr;
1090 			if (is_endian.little){
1091 				PUTU32(ctx->Yi.c+12,ctr);
1092 			}
1093 			else
1094 				ctx->Yi.d[3] = ctr;
1095 			for (i=0; i<16; i+=sizeof(size_t))
1096 				*(size_t *)(out+i) =
1097 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1098 			out += 16;
1099 			in  += 16;
1100 			j   -= 16;
1101 		    }
1102 		    len -= GHASH_CHUNK;
1103 		}
1104 		if ((i = (len&(size_t)-16))) {
1105 		    GHASH(ctx,in,i);
1106 		    while (len>=16) {
1107 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1108 			++ctr;
1109 			if (is_endian.little){
1110 				PUTU32(ctx->Yi.c+12,ctr);
1111 			}
1112 			else
1113 				ctx->Yi.d[3] = ctr;
1114 			for (i=0; i<16; i+=sizeof(size_t))
1115 				*(size_t *)(out+i) =
1116 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1117 			out += 16;
1118 			in  += 16;
1119 			len -= 16;
1120 		    }
1121 		}
1122 #else
1123 		while (len>=16) {
1124 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1125 			++ctr;
1126 			if (is_endian.little){
1127 				PUTU32(ctx->Yi.c+12,ctr);
1128 			}
1129 			else
1130 				ctx->Yi.d[3] = ctr;
1131 			for (i=0; i<16; i+=sizeof(size_t)) {
1132 				size_t c = *(size_t *)(in+i);
1133 				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1134 				*(size_t *)(ctx->Xi.c+i) ^= c;
1135 			}
1136 			GCM_MUL(ctx,Xi);
1137 			out += 16;
1138 			in  += 16;
1139 			len -= 16;
1140 		}
1141 #endif
1142 		if (len) {
1143 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1144 			++ctr;
1145 			if (is_endian.little){
1146 				PUTU32(ctx->Yi.c+12,ctr);
1147 			}
1148 			else
1149 				ctx->Yi.d[3] = ctr;
1150 			while (len--) {
1151 				u8 c = in[n];
1152 				ctx->Xi.c[n] ^= c;
1153 				out[n] = c^ctx->EKi.c[n];
1154 				++n;
1155 			}
1156 		}
1157 
1158 		ctx->mres = n;
1159 		return 0;
1160 	} while(0);
1161 #endif
1162 	for (i=0;i<len;++i) {
1163 		u8 c;
1164 		if (n==0) {
1165 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1166 			++ctr;
1167 			if (is_endian.little){
1168 				PUTU32(ctx->Yi.c+12,ctr);
1169 			}
1170 			else
1171 				ctx->Yi.d[3] = ctr;
1172 		}
1173 		c = in[i];
1174 		out[i] = c^ctx->EKi.c[n];
1175 		ctx->Xi.c[n] ^= c;
1176 		n = (n+1)%16;
1177 		if (n==0)
1178 			GCM_MUL(ctx,Xi);
1179 	}
1180 
1181 	ctx->mres = n;
1182 	return 0;
1183 }
1184 
1185 #if 0
1186 static int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1187 		const unsigned char *in, unsigned char *out,
1188 		size_t len, ctr128_f stream)
1189 {
1190 	const union { long one; char little; } is_endian = {1};
1191 	unsigned int n, ctr;
1192 	size_t i;
1193 	u64   mlen = ctx->len.u[1];
1194 	void *key  = ctx->key;
1195 #ifdef GCM_FUNCREF_4BIT
1196 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1197 # ifdef GHASH
1198 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1199 				const u8 *inp,size_t len)	= ctx->ghash;
1200 # endif
1201 #endif
1202 
1203 	mlen += len;
1204 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1205 		return -1;
1206 	ctx->len.u[1] = mlen;
1207 
1208 	if (ctx->ares) {
1209 		/* First call to encrypt finalizes GHASH(AAD) */
1210 		GCM_MUL(ctx,Xi);
1211 		ctx->ares = 0;
1212 	}
1213 
1214 	if (is_endian.little)
1215 		ctr = GETU32(ctx->Yi.c+12);
1216 	else
1217 		ctr = ctx->Yi.d[3];
1218 
1219 	n = ctx->mres;
1220 	if (n) {
1221 		while (n && len) {
1222 			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1223 			--len;
1224 			n = (n+1)%16;
1225 		}
1226 		if (n==0) GCM_MUL(ctx,Xi);
1227 		else {
1228 			ctx->mres = n;
1229 			return 0;
1230 		}
1231 	}
1232 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1233 	while (len>=GHASH_CHUNK) {
1234 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1235 		ctr += GHASH_CHUNK/16;
1236 		if (is_endian.little){
1237 			PUTU32(ctx->Yi.c+12,ctr);
1238 		}
1239 		else
1240 			ctx->Yi.d[3] = ctr;
1241 		GHASH(ctx,out,GHASH_CHUNK);
1242 		out += GHASH_CHUNK;
1243 		in  += GHASH_CHUNK;
1244 		len -= GHASH_CHUNK;
1245 	}
1246 #endif
1247 	i = (len&(size_t)-16);
1248 	if (i) {
1249 		size_t j=i/16;
1250 
1251 		(*stream)(in,out,j,key,ctx->Yi.c);
1252 		ctr += (unsigned int)j;
1253 		if (is_endian.little){
1254 			PUTU32(ctx->Yi.c+12,ctr);
1255 		}
1256 		else
1257 			ctx->Yi.d[3] = ctr;
1258 		in  += i;
1259 		len -= i;
1260 #if defined(GHASH)
1261 		GHASH(ctx,out,i);
1262 		out += i;
1263 #else
1264 		while (j--) {
1265 			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1266 			GCM_MUL(ctx,Xi);
1267 			out += 16;
1268 		}
1269 #endif
1270 	}
1271 	if (len) {
1272 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1273 		++ctr;
1274 		if (is_endian.little){
1275 			PUTU32(ctx->Yi.c+12,ctr);
1276 		}
1277 		else
1278 			ctx->Yi.d[3] = ctr;
1279 		while (len--) {
1280 			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1281 			++n;
1282 		}
1283 	}
1284 
1285 	ctx->mres = n;
1286 	return 0;
1287 }
1288 
1289 static int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1290 		const unsigned char *in, unsigned char *out,
1291 		size_t len,ctr128_f stream)
1292 {
1293 	const union { long one; char little; } is_endian = {1};
1294 	unsigned int n, ctr;
1295 	size_t i;
1296 	u64   mlen = ctx->len.u[1];
1297 	void *key  = ctx->key;
1298 #ifdef GCM_FUNCREF_4BIT
1299 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1300 # ifdef GHASH
1301 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1302 				const u8 *inp,size_t len)	= ctx->ghash;
1303 # endif
1304 #endif
1305 
1306 	mlen += len;
1307 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1308 		return -1;
1309 	ctx->len.u[1] = mlen;
1310 
1311 	if (ctx->ares) {
1312 		/* First call to decrypt finalizes GHASH(AAD) */
1313 		GCM_MUL(ctx,Xi);
1314 		ctx->ares = 0;
1315 	}
1316 
1317 	if (is_endian.little)
1318 		ctr = GETU32(ctx->Yi.c+12);
1319 	else
1320 		ctr = ctx->Yi.d[3];
1321 
1322 	n = ctx->mres;
1323 	if (n) {
1324 		while (n && len) {
1325 			u8 c = *(in++);
1326 			*(out++) = c^ctx->EKi.c[n];
1327 			ctx->Xi.c[n] ^= c;
1328 			--len;
1329 			n = (n+1)%16;
1330 		}
1331 		if (n==0) GCM_MUL (ctx,Xi);
1332 		else {
1333 			ctx->mres = n;
1334 			return 0;
1335 		}
1336 	}
1337 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1338 	while (len>=GHASH_CHUNK) {
1339 		GHASH(ctx,in,GHASH_CHUNK);
1340 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1341 		ctr += GHASH_CHUNK/16;
1342 		if (is_endian.little){
1343 			PUTU32(ctx->Yi.c+12,ctr);
1344 		}
1345 		else
1346 			ctx->Yi.d[3] = ctr;
1347 		out += GHASH_CHUNK;
1348 		in  += GHASH_CHUNK;
1349 		len -= GHASH_CHUNK;
1350 	}
1351 #endif
1352 	i = (len&(size_t)-16);
1353 	if (i) {
1354 		size_t j=i/16;
1355 
1356 #if defined(GHASH)
1357 		GHASH(ctx,in,i);
1358 #else
1359 		while (j--) {
1360 			size_t k;
1361 			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1362 			GCM_MUL(ctx,Xi);
1363 			in += 16;
1364 		}
1365 		j   = i/16;
1366 		in -= i;
1367 #endif
1368 		(*stream)(in,out,j,key,ctx->Yi.c);
1369 		ctr += (unsigned int)j;
1370 		if (is_endian.little){
1371 			PUTU32(ctx->Yi.c+12,ctr);
1372 		}
1373 		else
1374 			ctx->Yi.d[3] = ctr;
1375 		out += i;
1376 		in  += i;
1377 		len -= i;
1378 	}
1379 	if (len) {
1380 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1381 		++ctr;
1382 		if (is_endian.little){
1383 			PUTU32(ctx->Yi.c+12,ctr);
1384 		}
1385 		else
1386 			ctx->Yi.d[3] = ctr;
1387 		while (len--) {
1388 			u8 c = in[n];
1389 			ctx->Xi.c[n] ^= c;
1390 			out[n] = c^ctx->EKi.c[n];
1391 			++n;
1392 		}
1393 	}
1394 
1395 	ctx->mres = n;
1396 	return 0;
1397 }
1398 #endif
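/*
 * Completes GHASH with the AAD/ciphertext bit lengths and XORs in EK0 to
 * form the tag; if a tag of at most 16 bytes is supplied it is compared
 * with memcmp(). CRYPTO_gcm128_tag() instead copies the computed tag out.
 */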
1399 static int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1400 			size_t len)
1401 {
1402 	const union { long one; char little; } is_endian = {1};
1403 	u64 alen = ctx->len.u[0]<<3;
1404 	u64 clen = ctx->len.u[1]<<3;
1405 #ifdef GCM_FUNCREF_4BIT
1406 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1407 #endif
1408 
1409 	if (ctx->mres || ctx->ares)
1410 		GCM_MUL(ctx,Xi);
1411 
1412 	if (is_endian.little) {
1413 #ifdef BSWAP8
1414 		alen = BSWAP8(alen);
1415 		clen = BSWAP8(clen);
1416 #else
1417 		u8 *p = ctx->len.c;
1418 
1419 		ctx->len.u[0] = alen;
1420 		ctx->len.u[1] = clen;
1421 
1422 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1423 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1424 #endif
1425 	}
1426 
1427 	ctx->Xi.u[0] ^= alen;
1428 	ctx->Xi.u[1] ^= clen;
1429 	GCM_MUL(ctx,Xi);
1430 
1431 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1432 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1433 
1434 	if (tag && len<=sizeof(ctx->Xi))
1435 		return memcmp(ctx->Xi.c,tag,len);
1436 	else
1437 		return -1;
1438 }
1439 
1440 static void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1441 {
1442 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1443 	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1444 }
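/* Byte-wise tag comparison used on the decrypt path; note it is not constant-time. */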
1445 static int compare_string(unsigned char *a, unsigned char *b, unsigned int len)
1446 {
1447 	unsigned int i;
1448 
1449 	if((a == NULL) || (b == NULL))
1450 		return -1;
1451 
1452 	for (i = 0; i < len; i++){
1453 		if(*a != *b)
1454 			return -1;
1455 		a++;
1456 		b++;
1457 	}
1458 	return 0;
1459 }
1460 int rk_aes_gcm_encrypt(struct aes_ae_in *in, struct aes_ae_out *out, const int enc)
1461 {
1462 	GCM128_CONTEXT ctx;
1463 	unsigned char tmp[16];
1464 	RK_AES_KEY ks;
1465 	if(in == NULL || out== NULL)
1466 		return -1;
1467 
1468 	if (in->key_len != 16 && in->key_len != 24 && in->key_len != 32)
1469 		return -1;
1470 
1471 	if (in->tag_size > 16)
1472 		return -1;
1473 
1474 	rk_aes_set_encrypt_key(in->key,in->key_len*8,&ks);
1475 	CRYPTO_gcm128_init(&ctx,&ks,(block128_f)rk_aes_encrypt);
1476 	CRYPTO_gcm128_setiv(&ctx,in->iv,in->iv_len);
1477 	if (in->aad_len) CRYPTO_gcm128_aad(&ctx,in->aad,in->aad_len);
1478 	if (enc) {
1479 		if (in->src_len) CRYPTO_gcm128_encrypt(&ctx,in->src,out->dest,in->src_len);
1480 		CRYPTO_gcm128_tag(&ctx, out->tag, in->tag_size);
1481 		return 0;
1482 	} else {
1483 		if (in->src_len) CRYPTO_gcm128_decrypt(&ctx,in->src,out->dest,in->src_len);
1484 		CRYPTO_gcm128_tag(&ctx, tmp, in->tag_size);
1485 		return compare_string(tmp, out->tag, in->tag_size);
1486 	}
1487 }
1488 
1489 
1490
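/*
 * Usage sketch (kept out of the build with #if 0): shows how the one-shot
 * rk_aes_gcm_encrypt() wrapper above is expected to be driven for an
 * AES-128-GCM encryption. struct aes_ae_in / struct aes_ae_out come from
 * aes_core.h; their exact member types are assumed here to be plain
 * pointer/length fields matching the way this file dereferences them.
 */
#if 0
static int example_aes_gcm(void)
{
	unsigned char key[16] = {0};            /* 128-bit key (all zero for the sketch) */
	unsigned char iv[12]  = {0};            /* 96-bit IV, the fast-path case in setiv() */
	unsigned char aad[8]  = "headers";      /* additional authenticated data */
	unsigned char pt[32]  = {0};            /* plaintext */
	unsigned char ct[32], tag[16];
	struct aes_ae_in  in  = {0};
	struct aes_ae_out out = {0};

	in.key      = key;  in.key_len = sizeof(key);
	in.iv       = iv;   in.iv_len  = sizeof(iv);
	in.aad      = aad;  in.aad_len = sizeof(aad);
	in.src      = pt;   in.src_len = sizeof(pt);
	in.tag_size = 16;                       /* full 16-byte tag */
	out.dest    = ct;
	out.tag     = tag;

	/* enc=1: encrypt and emit the tag; enc=0: decrypt and verify against out.tag */
	return rk_aes_gcm_encrypt(&in, &out, 1);
}
#endif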