xref: /OK3568_Linux_fs/external/security/librkcrypto/test/c_mode/sm4_gcm.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include "sm4_core.h"
5 #include "sm4_locl.h"
6 
7 #define OPENSSL_FIPSAPI
8 #define TABLE_BITS 1
9 #include <string.h>
10 #define DEBUG(format,...) printf("[%s]:%d: "format"\n", __func__,__LINE__, ##__VA_ARGS__)
11 
12 #ifndef MODES_DEBUG
13 # ifndef NDEBUG
14 #  define NDEBUG
15 # endif
16 #endif
17 
18 #if defined(BSWAP4)
19 /* redefine, because alignment is ensured */
20 #undef	GETU32
21 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
22 #undef	PUTU32
23 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
24 #endif
25 
26 #define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
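/*
 * REDUCE1BIT shifts the 128-bit value V right by one bit and, when the bit
 * shifted out was set, folds in the GCM reduction constant (0xE1 in the top
 * byte, i.e. x^128 + x^7 + x^2 + x + 1 in the reflected bit order).  The
 * sizeof(size_t) test lets the compiler pick a 64-bit or 32-bit constant
 * at compile time.
 */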
27 #define REDUCE1BIT(V)	do { \
28 	if (sizeof(size_t)==8) { \
29 		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
30 		V.lo  = (V.hi<<63)|(V.lo>>1); \
31 		V.hi  = (V.hi>>1 )^T; \
32 	} \
33 	else { \
34 		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
35 		V.lo  = (V.hi<<63)|(V.lo>>1); \
36 		V.hi  = (V.hi>>1 )^((u64)T<<32); \
37 	} \
38 } while(0)
39 
40 typedef struct { u64 hi,lo; } u128;
41 
42 typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
43 			unsigned int blocks, const void *key,
44 			const unsigned char ivec[16]);
45 
46 
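/*
 * GCM working state: Yi is the current counter block, EKi the encrypted
 * counter (key stream), EK0 the encryption of the pre-counter block used to
 * mask the tag, len the AAD/message byte counts, Xi the running GHASH value
 * and H the hash subkey.  ares and mres carry partial-block residues for the
 * AAD and the message respectively.
 */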
47 struct gcm128_context {
48 	/* Following 6 names follow names in GCM specification */
49 	union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len,
50 						Xi,H;
51 	/* Relative position of Xi, H and pre-computed Htable is used
52 	 * in some assembler modules, i.e. don't change the order! */
53 #if TABLE_BITS==8
54 	u128 Htable[256];
55 #else
56 	u128 Htable[16];
57 	void (*gmult)(u64 Xi[2],const u128 Htable[16]);
58 	void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
59 #endif
60 	unsigned int mres, ares;
61 	block128_f block;
62 	void *key;
63 };
64 
65 typedef struct gcm128_context GCM128_CONTEXT;
66 
67 /*
68  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
69  * never be set to 8. 8 is effectively reserved for testing purposes.
70  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
71  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
72  * whole spectrum of possible table driven implementations. Why? In
73  * non-"Shoup's" case memory access pattern is segmented in such manner,
74  * that it's trivial to see that cache timing information can reveal
75  * fair portion of intermediate hash value. Given that ciphertext is
76  * always available to the attacker, it's possible to attempt to
77  * deduce the secret parameter H and, if successful, tamper with messages
78  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
79  * not as trivial, but there is no reason to believe that it's resistant
80  * to cache-timing attack. And the thing about "8-bit" implementation is
81  * that it consumes 16 (sixteen) times more memory, 4KB per individual
82  * key + 1KB shared. Well, on pros side it should be twice as fast as
83  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
84  * was observed to run ~75% faster, closer to 100% for commercial
85  * compilers... Yet "4-bit" procedure is preferred, because it's
86  * believed to provide better security-performance balance and adequate
87  * all-round performance. "All-round" refers to things like:
88  *
89  * - shorter setup time effectively improves overall timing for
90  *   handling short messages;
91  * - larger table allocation can become unbearable because of VM
92  *   subsystem penalties (for example, on Windows freeing a large enough
93  *   block results in VM working-set trimming, meaning that a subsequent
94  *   malloc would immediately incur working-set expansion);
95  * - larger table has larger cache footprint, which can affect
96  *   performance of other code paths (not necessarily even from same
97  *   thread in Hyper-Threading world);
98  *
99  * Value of 1 is not appropriate for performance reasons.
100  */
101 #if	TABLE_BITS==8
102 
103 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
104 {
105 	int  i, j;
106 	u128 V;
107 
108 	Htable[0].hi = 0;
109 	Htable[0].lo = 0;
110 	V.hi = H[0];
111 	V.lo = H[1];
112 
113 	for (Htable[128]=V, i=64; i>0; i>>=1) {
114 		REDUCE1BIT(V);
115 		Htable[i] = V;
116 	}
117 
118 	for (i=2; i<256; i<<=1) {
119 		u128 *Hi = Htable+i, H0 = *Hi;
120 		for (j=1; j<i; ++j) {
121 			Hi[j].hi = H0.hi^Htable[j].hi;
122 			Hi[j].lo = H0.lo^Htable[j].lo;
123 		}
124 	}
125 }
126 
127 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
128 {
129 	u128 Z = { 0, 0};
130 	const u8 *xi = (const u8 *)Xi+15;
131 	size_t rem, n = *xi;
132 	const union { long one; char little; } is_endian = {1};
133 	static const size_t rem_8bit[256] = {
134 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
135 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
136 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
137 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
138 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
139 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
140 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
141 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
142 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
143 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
144 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
145 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
146 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
147 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
148 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
149 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
150 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
151 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
152 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
153 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
154 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
155 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
156 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
157 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
158 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
159 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
160 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
161 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
162 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
163 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
164 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
165 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
166 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
167 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
168 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
169 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
170 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
171 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
172 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
173 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
174 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
175 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
176 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
177 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
178 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
179 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
180 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
181 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
182 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
183 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
184 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
185 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
186 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
187 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
188 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
189 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
190 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
191 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
192 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
193 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
194 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
195 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
196 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
197 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
198 
199 	while (1) {
200 		Z.hi ^= Htable[n].hi;
201 		Z.lo ^= Htable[n].lo;
202 
203 		if ((u8 *)Xi==xi)	break;
204 
205 		n = *(--xi);
206 
207 		rem  = (size_t)Z.lo&0xff;
208 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
209 		Z.hi = (Z.hi>>8);
210 		if (sizeof(size_t)==8)
211 			Z.hi ^= rem_8bit[rem];
212 		else
213 			Z.hi ^= (u64)rem_8bit[rem]<<32;
214 	}
215 
216 	if (is_endian.little) {
217 #ifdef BSWAP8
218 		Xi[0] = BSWAP8(Z.hi);
219 		Xi[1] = BSWAP8(Z.lo);
220 #else
221 		u8 *p = (u8 *)Xi;
222 		u32 v;
223 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
224 		v = (u32)(Z.hi);	PUTU32(p+4,v);
225 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
226 		v = (u32)(Z.lo);	PUTU32(p+12,v);
227 #endif
228 	}
229 	else {
230 		Xi[0] = Z.hi;
231 		Xi[1] = Z.lo;
232 	}
233 }
234 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
235 
236 #elif	TABLE_BITS==4
237 
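/*
 * Build the 16-entry "Shoup" table: Htable holds the multiples of the hash
 * subkey H needed to process one 4-bit nibble of input at a time.  It is
 * derived from H by repeated REDUCE1BIT shifts plus XOR combinations.
 */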
238 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
239 {
240 	u128 V;
241 #if defined(OPENSSL_SMALL_FOOTPRINT)
242 	int  i;
243 #endif
244 
245 	Htable[0].hi = 0;
246 	Htable[0].lo = 0;
247 	V.hi = H[0];
248 	V.lo = H[1];
249 
250 #if defined(OPENSSL_SMALL_FOOTPRINT)
251 	for (Htable[8]=V, i=4; i>0; i>>=1) {
252 		REDUCE1BIT(V);
253 		Htable[i] = V;
254 	}
255 
256 	for (i=2; i<16; i<<=1) {
257 		u128 *Hi = Htable+i;
258 		int   j;
259 		for (V=*Hi, j=1; j<i; ++j) {
260 			Hi[j].hi = V.hi^Htable[j].hi;
261 			Hi[j].lo = V.lo^Htable[j].lo;
262 		}
263 	}
264 #else
265 	Htable[8] = V;
266 	REDUCE1BIT(V);
267 	Htable[4] = V;
268 	REDUCE1BIT(V);
269 	Htable[2] = V;
270 	REDUCE1BIT(V);
271 	Htable[1] = V;
272 	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
273 	V=Htable[4];
274 	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
275 	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
276 	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
277 	V=Htable[8];
278 	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
279 	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
280 	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
281 	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
282 	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
283 	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
284 	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
285 #endif
286 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
287 	/*
288 	 * ARM assembler expects specific dword order in Htable.
289 	 */
290 	{
291 	int j;
292 	const union { long one; char little; } is_endian = {1};
293 
294 	if (is_endian.little)
295 		for (j=0;j<16;++j) {
296 			V = Htable[j];
297 			Htable[j].hi = V.lo;
298 			Htable[j].lo = V.hi;
299 		}
300 	else
301 		for (j=0;j<16;++j) {
302 			V = Htable[j];
303 			Htable[j].hi = V.lo<<32|V.lo>>32;
304 			Htable[j].lo = V.hi<<32|V.hi>>32;
305 		}
306 	}
307 #endif
308 }
309 
310 #ifndef GHASH_ASM
311 static const size_t rem_4bit[16] = {
312 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
313 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
314 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
315 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
316 
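/*
 * Multiply Xi by H in place using the 4-bit table: walk Xi from its last
 * byte to its first, one nibble at a time, shifting the 128-bit accumulator
 * right by 4 bits per step and folding the shifted-out bits back in via the
 * rem_4bit reduction table.
 */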
317 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
318 {
319 	u128 Z;
320 	int cnt = 15;
321 	size_t rem, nlo, nhi;
322 	const union { long one; char little; } is_endian = {1};
323 
324 	nlo  = ((const u8 *)Xi)[15];
325 	nhi  = nlo>>4;
326 	nlo &= 0xf;
327 
328 	Z.hi = Htable[nlo].hi;
329 	Z.lo = Htable[nlo].lo;
330 
331 	while (1) {
332 		rem  = (size_t)Z.lo&0xf;
333 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
334 		Z.hi = (Z.hi>>4);
335 		if (sizeof(size_t)==8)
336 			Z.hi ^= rem_4bit[rem];
337 		else
338 			Z.hi ^= (u64)rem_4bit[rem]<<32;
339 
340 		Z.hi ^= Htable[nhi].hi;
341 		Z.lo ^= Htable[nhi].lo;
342 
343 		if (--cnt<0)		break;
344 
345 		nlo  = ((const u8 *)Xi)[cnt];
346 		nhi  = nlo>>4;
347 		nlo &= 0xf;
348 
349 		rem  = (size_t)Z.lo&0xf;
350 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
351 		Z.hi = (Z.hi>>4);
352 		if (sizeof(size_t)==8)
353 			Z.hi ^= rem_4bit[rem];
354 		else
355 			Z.hi ^= (u64)rem_4bit[rem]<<32;
356 
357 		Z.hi ^= Htable[nlo].hi;
358 		Z.lo ^= Htable[nlo].lo;
359 	}
360 
361 	if (is_endian.little) {
362 #ifdef BSWAP8
363 		Xi[0] = BSWAP8(Z.hi);
364 		Xi[1] = BSWAP8(Z.lo);
365 #else
366 		u8 *p = (u8 *)Xi;
367 		u32 v;
368 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
369 		v = (u32)(Z.hi);	PUTU32(p+4,v);
370 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
371 		v = (u32)(Z.lo);	PUTU32(p+12,v);
372 #endif
373 	}
374 	else {
375 		Xi[0] = Z.hi;
376 		Xi[1] = Z.lo;
377 	}
378 }
379 
380 #if !defined(OPENSSL_SMALL_FOOTPRINT)
381 /*
382  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
383  * details... Compiler-generated code doesn't seem to give any
384  * performance improvement, at least not on x86[_64]. It's here
385  * mostly as reference and a placeholder for possible future
386  * non-trivial optimization[s]...
387  */
388 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
389 				const u8 *inp,size_t len)
390 {
391     u128 Z;
392     int cnt;
393     size_t rem, nlo, nhi;
394     const union { long one; char little; } is_endian = {1};
395 
396 #if 1
397     do {
398 	cnt  = 15;
399 	nlo  = ((const u8 *)Xi)[15];
400 	nlo ^= inp[15];
401 	nhi  = nlo>>4;
402 	nlo &= 0xf;
403 
404 	Z.hi = Htable[nlo].hi;
405 	Z.lo = Htable[nlo].lo;
406 
407 	while (1) {
408 		rem  = (size_t)Z.lo&0xf;
409 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
410 		Z.hi = (Z.hi>>4);
411 		if (sizeof(size_t)==8)
412 			Z.hi ^= rem_4bit[rem];
413 		else
414 			Z.hi ^= (u64)rem_4bit[rem]<<32;
415 
416 		Z.hi ^= Htable[nhi].hi;
417 		Z.lo ^= Htable[nhi].lo;
418 
419 		if (--cnt<0)		break;
420 
421 		nlo  = ((const u8 *)Xi)[cnt];
422 		nlo ^= inp[cnt];
423 		nhi  = nlo>>4;
424 		nlo &= 0xf;
425 
426 		rem  = (size_t)Z.lo&0xf;
427 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
428 		Z.hi = (Z.hi>>4);
429 		if (sizeof(size_t)==8)
430 			Z.hi ^= rem_4bit[rem];
431 		else
432 			Z.hi ^= (u64)rem_4bit[rem]<<32;
433 
434 		Z.hi ^= Htable[nlo].hi;
435 		Z.lo ^= Htable[nlo].lo;
436 	}
437 #else
438     /*
439      * Extra 256+16 bytes per-key plus 512 bytes shared tables
440      * [should] give ~50% improvement... One could have PACK()-ed
441      * the rem_8bit even here, but the priority is to minimize
442      * cache footprint...
443      */
444     u128 Hshr4[16];	/* Htable shifted right by 4 bits */
445     u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
446     static const unsigned short rem_8bit[256] = {
447 	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
448 	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
449 	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
450 	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
451 	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
452 	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
453 	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
454 	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
455 	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
456 	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
457 	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
458 	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
459 	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
460 	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
461 	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
462 	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
463 	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
464 	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
465 	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
466 	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
467 	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
468 	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
469 	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
470 	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
471 	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
472 	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
473 	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
474 	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
475 	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
476 	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
477 	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
478 	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
479     /*
480      * This pre-processing phase slows down procedure by approximately
481      * same time as it makes each loop spin faster. In other words
482      * single block performance is approximately same as straightforward
483      * "4-bit" implementation, and then it goes only faster...
484      */
485     for (cnt=0; cnt<16; ++cnt) {
486 	Z.hi = Htable[cnt].hi;
487 	Z.lo = Htable[cnt].lo;
488 	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
489 	Hshr4[cnt].hi = (Z.hi>>4);
490 	Hshl4[cnt]    = (u8)(Z.lo<<4);
491     }
492 
493     do {
494 	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
495 		nlo  = ((const u8 *)Xi)[cnt];
496 		nlo ^= inp[cnt];
497 		nhi  = nlo>>4;
498 		nlo &= 0xf;
499 
500 		Z.hi ^= Htable[nlo].hi;
501 		Z.lo ^= Htable[nlo].lo;
502 
503 		rem = (size_t)Z.lo&0xff;
504 
505 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
506 		Z.hi = (Z.hi>>8);
507 
508 		Z.hi ^= Hshr4[nhi].hi;
509 		Z.lo ^= Hshr4[nhi].lo;
510 		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
511 	}
512 
513 	nlo  = ((const u8 *)Xi)[0];
514 	nlo ^= inp[0];
515 	nhi  = nlo>>4;
516 	nlo &= 0xf;
517 
518 	Z.hi ^= Htable[nlo].hi;
519 	Z.lo ^= Htable[nlo].lo;
520 
521 	rem = (size_t)Z.lo&0xf;
522 
523 	Z.lo = (Z.hi<<60)|(Z.lo>>4);
524 	Z.hi = (Z.hi>>4);
525 
526 	Z.hi ^= Htable[nhi].hi;
527 	Z.lo ^= Htable[nhi].lo;
528 	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
529 #endif
530 
531 	if (is_endian.little) {
532 #ifdef BSWAP8
533 		Xi[0] = BSWAP8(Z.hi);
534 		Xi[1] = BSWAP8(Z.lo);
535 #else
536 		u8 *p = (u8 *)Xi;
537 		u32 v;
538 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
539 		v = (u32)(Z.hi);	PUTU32(p+4,v);
540 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
541 		v = (u32)(Z.lo);	PUTU32(p+12,v);
542 #endif
543 	}
544 	else {
545 		Xi[0] = Z.hi;
546 		Xi[1] = Z.lo;
547 	}
548     } while (inp+=16, len-=16);
549 }
550 #endif
551 #else
552 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
553 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
554 #endif
555 
556 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
557 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
558 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
559 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
560  * trashing effect. In other words idea is to hash data while it's
561  * still in L1 cache after encryption pass... */
562 #define GHASH_CHUNK       (3*1024)
563 #endif
564 
565 #else	/* TABLE_BITS */
566 
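/*
 * Bit-serial GF(2^128) multiplication used when TABLE_BITS==1: for every bit
 * of Xi, conditionally XOR the current multiple of H into the accumulator,
 * then advance that multiple by one bit with REDUCE1BIT.  Slowest variant,
 * but needs no precomputed table.
 */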
567 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
568 {
569 	u128 V,Z = { 0,0 };
570 	long X;
571 	unsigned int  i,j;
572 	const long *xi = (const long *)Xi;
573 	const union { long one; char little; } is_endian = {1};
574 
575 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
576 	V.lo = H[1];
577 
578 	for (j=0; j<16/sizeof(long); ++j) {
579 		if (is_endian.little) {
580 			if (sizeof(long)==8) {
581 #ifdef BSWAP8
582 				X = (long)(BSWAP8(xi[j]));
583 #else
584 				const u8 *p = (const u8 *)(xi+j);
585 				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
586 #endif
587 			}
588 			else {
589 				const u8 *p = (const u8 *)(xi+j);
590 				X = (long)GETU32(p);
591 			}
592 		}
593 		else
594 			X = xi[j];
595 
596 		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
597 			u64 M = (u64)(X>>(8*sizeof(long)-1));
598 			Z.hi ^= V.hi&M;
599 			Z.lo ^= V.lo&M;
600 
601 			REDUCE1BIT(V);
602 		}
603 	}
604 
605 	if (is_endian.little) {
606 #ifdef BSWAP8
607 		Xi[0] = BSWAP8(Z.hi);
608 		Xi[1] = BSWAP8(Z.lo);
609 #else
610 		u8 *p = (u8 *)Xi;
611 		u32 v;
612 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
613 		v = (u32)(Z.hi);	PUTU32(p+4,v);
614 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
615 		v = (u32)(Z.lo);	PUTU32(p+12,v);
616 #endif
617 	}
618 	else {
619 		Xi[0] = Z.hi;
620 		Xi[1] = Z.lo;
621 	}
622 }
623 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
624 
625 #endif
626 
627 #if	TABLE_BITS==4 && defined(GHASH_ASM)
628 # if	!defined(I386_ONLY) && \
629 	(defined(__i386)	|| defined(__i386__)	|| \
630 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
631 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
632 #  define GHASH_ASM_X86_OR_64
633 #  define GCM_FUNCREF_4BIT
634 extern unsigned int OPENSSL_ia32cap_P[2];
635 
636 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
637 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
638 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
639 
640 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
641 #   define GHASH_ASM_X86
642 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
643 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
644 
645 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
646 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
647 #  endif
648 # elif defined(__arm__) || defined(__arm)
649 #  include "arm_arch.h"
650 #  if __ARM_ARCH__>=7
651 #   define GHASH_ASM_ARM
652 #   define GCM_FUNCREF_4BIT
653 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
654 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
655 #  endif
656 # endif
657 #endif
658 
659 #ifdef GCM_FUNCREF_4BIT
660 # undef  GCM_MUL
661 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
662 # ifdef GHASH
663 #  undef  GHASH
664 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
665 # endif
666 #endif
667 
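/*
 * Initialise the GCM context: remember the block cipher and key, derive the
 * hash subkey H = E_K(0^128), convert it to host byte order and, depending
 * on TABLE_BITS and available assembler, precompute the GHASH table and
 * select the gmult/ghash implementations.
 */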
668 static void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
669 {
670 	const union { long one; char little; } is_endian = {1};
671 
672 	memset(ctx,0,sizeof(*ctx));
673 	ctx->block = block;
674 	ctx->key   = key;
675 
676 	(*block)(ctx->H.c, ctx->H.c,key);
677 
678 	if (is_endian.little) {
679 		/* H is stored in host byte order */
680 #ifdef BSWAP8
681 		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
682 		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
683 #else
684 		u8 *p = ctx->H.c;
685 		u64 hi,lo;
686 		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
687 		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
688 		ctx->H.u[0] = hi;
689 		ctx->H.u[1] = lo;
690 #endif
691 	}
692 
693 #if	TABLE_BITS==8
694 	gcm_init_8bit(ctx->Htable,ctx->H.u);
695 #elif	TABLE_BITS==4
696 # if	defined(GHASH_ASM_X86_OR_64)
697 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
698 	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
699 	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
700 		gcm_init_clmul(ctx->Htable,ctx->H.u);
701 		ctx->gmult = gcm_gmult_clmul;
702 		ctx->ghash = gcm_ghash_clmul;
703 		return;
704 	}
705 #  endif
706 	gcm_init_4bit(ctx->Htable,ctx->H.u);
707 #  if	defined(GHASH_ASM_X86)			/* x86 only */
708 #   if	defined(OPENSSL_IA32_SSE2)
709 	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
710 #   else
711 	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
712 #   endif
713 		ctx->gmult = gcm_gmult_4bit_mmx;
714 		ctx->ghash = gcm_ghash_4bit_mmx;
715 	} else {
716 		ctx->gmult = gcm_gmult_4bit_x86;
717 		ctx->ghash = gcm_ghash_4bit_x86;
718 	}
719 #  else
720 	ctx->gmult = gcm_gmult_4bit;
721 	ctx->ghash = gcm_ghash_4bit;
722 #  endif
723 # elif	defined(GHASH_ASM_ARM)
724 	if (OPENSSL_armcap_P & ARMV7_NEON) {
725 		ctx->gmult = gcm_gmult_neon;
726 		ctx->ghash = gcm_ghash_neon;
727 	} else {
728 		gcm_init_4bit(ctx->Htable,ctx->H.u);
729 		ctx->gmult = gcm_gmult_4bit;
730 		ctx->ghash = gcm_ghash_4bit;
731 	}
732 # else
733 	gcm_init_4bit(ctx->Htable,ctx->H.u);
734 # endif
735 #endif
736 }
737 
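/*
 * Load the IV: a 96-bit IV is used directly as Y0 with the counter set to 1;
 * any other length is GHASHed (zero-padded and followed by its bit length)
 * to derive Y0.  EK0 = E_K(Y0) is computed here and kept for the final tag,
 * then the counter word is advanced for the first data block.
 */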
738 static void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
739 {
740 	const union { long one; char little; } is_endian = {1};
741 	unsigned int ctr;
742 #ifdef GCM_FUNCREF_4BIT
743 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
744 #endif
745 
746 	ctx->Yi.u[0]  = 0;
747 	ctx->Yi.u[1]  = 0;
748 	ctx->Xi.u[0]  = 0;
749 	ctx->Xi.u[1]  = 0;
750 	ctx->len.u[0] = 0;	/* AAD length */
751 	ctx->len.u[1] = 0;	/* message length */
752 	ctx->ares = 0;
753 	ctx->mres = 0;
754 
755 	if (len==12) {
756 		memcpy(ctx->Yi.c,iv,12);
757 		ctx->Yi.c[15]=1;
758 		ctr=1;
759 	}
760 	else {
761 		size_t i;
762 		u64 len0 = len;
763 
764 		while (len>=16) {
765 			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
766 			GCM_MUL(ctx,Yi);
767 			iv += 16;
768 			len -= 16;
769 		}
770 		if (len) {
771 			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
772 			GCM_MUL(ctx,Yi);
773 		}
774 		len0 <<= 3;
775 		if (is_endian.little) {
776 #ifdef BSWAP8
777 			ctx->Yi.u[1]  ^= BSWAP8(len0);
778 #else
779 			ctx->Yi.c[8]  ^= (u8)(len0>>56);
780 			ctx->Yi.c[9]  ^= (u8)(len0>>48);
781 			ctx->Yi.c[10] ^= (u8)(len0>>40);
782 			ctx->Yi.c[11] ^= (u8)(len0>>32);
783 			ctx->Yi.c[12] ^= (u8)(len0>>24);
784 			ctx->Yi.c[13] ^= (u8)(len0>>16);
785 			ctx->Yi.c[14] ^= (u8)(len0>>8);
786 			ctx->Yi.c[15] ^= (u8)(len0);
787 #endif
788 		}
789 		else
790 			ctx->Yi.u[1]  ^= len0;
791 
792 		GCM_MUL(ctx,Yi);
793 
794 		if (is_endian.little)
795 			ctr = GETU32(ctx->Yi.c+12);
796 		else
797 			ctr = ctx->Yi.d[3];
798 	}
799 
800 	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
801 	++ctr;
802 	if (is_endian.little){
803 		PUTU32(ctx->Yi.c+12,ctr);
804 	}
805 	else
806 		ctx->Yi.d[3] = ctr;
807 }
808 
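/*
 * Absorb additional authenticated data into Xi.  May be called repeatedly,
 * but only before any payload has been processed; a partial block is carried
 * in ctx->ares.  Returns 0 on success, -1 if the AAD length limit is
 * exceeded, -2 if message data has already been seen.
 */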
809 static int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
810 {
811 	size_t i;
812 	unsigned int n;
813 	u64 alen = ctx->len.u[0];
814 #ifdef GCM_FUNCREF_4BIT
815 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
816 # ifdef GHASH
817 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
818 				const u8 *inp,size_t len)	= ctx->ghash;
819 # endif
820 #endif
821 
822 	if (ctx->len.u[1]) return -2;
823 
824 	alen += len;
825 	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
826 		return -1;
827 	ctx->len.u[0] = alen;
828 
829 	n = ctx->ares;
830 	if (n) {
831 		while (n && len) {
832 			ctx->Xi.c[n] ^= *(aad++);
833 			--len;
834 			n = (n+1)%16;
835 		}
836 		if (n==0) GCM_MUL(ctx,Xi);
837 		else {
838 			ctx->ares = n;
839 			return 0;
840 		}
841 	}
842 
843 #ifdef GHASH
844 	if ((i = (len&(size_t)-16))) {
845 		GHASH(ctx,aad,i);
846 		aad += i;
847 		len -= i;
848 	}
849 #else
850 	while (len>=16) {
851 		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
852 		GCM_MUL(ctx,Xi);
853 		aad += 16;
854 		len -= 16;
855 	}
856 #endif
857 	if (len) {
858 		n = (unsigned int)len;
859 		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
860 	}
861 
862 	ctx->ares = n;
863 	return 0;
864 }
865 
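/*
 * CTR-mode encryption of len bytes that also hashes the produced ciphertext
 * into Xi.  A carried partial block (ctx->mres) is finished first, bulk
 * blocks follow (in GHASH_CHUNK strides when the GHASH macro is available,
 * so data is hashed while still hot in cache), and a trailing partial block
 * is kept in mres.  Returns -1 if the 2^36-32 byte message limit is exceeded.
 */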
866 static int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
867 		const unsigned char *in, unsigned char *out,
868 		size_t len)
869 {
870 	const union { long one; char little; } is_endian = {1};
871 	unsigned int n, ctr;
872 	size_t i;
873 	u64        mlen  = ctx->len.u[1];
874 	block128_f block = ctx->block;
875 	void      *key   = ctx->key;
876 #ifdef GCM_FUNCREF_4BIT
877 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
878 # ifdef GHASH
879 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
880 				const u8 *inp,size_t len)	= ctx->ghash;
881 # endif
882 #endif
883 
884 #if 0
885 	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
886 #endif
887 	mlen += len;
888 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
889 		return -1;
890 	ctx->len.u[1] = mlen;
891 
892 	if (ctx->ares) {
893 		/* First call to encrypt finalizes GHASH(AAD) */
894 		GCM_MUL(ctx,Xi);
895 		ctx->ares = 0;
896 	}
897 
898 	if (is_endian.little)
899 		ctr = GETU32(ctx->Yi.c+12);
900 	else
901 		ctr = ctx->Yi.d[3];
902 
903 	n = ctx->mres;
904 #if !defined(OPENSSL_SMALL_FOOTPRINT)
905 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
906 		if (n) {
907 			while (n && len) {
908 				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
909 				--len;
910 				n = (n+1)%16;
911 			}
912 			if (n==0) GCM_MUL(ctx,Xi);
913 			else {
914 				ctx->mres = n;
915 				return 0;
916 			}
917 		}
918 #if defined(STRICT_ALIGNMENT)
919 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
920 			break;
921 #endif
922 #if defined(GHASH) && defined(GHASH_CHUNK)
923 		while (len>=GHASH_CHUNK) {
924 		    size_t j=GHASH_CHUNK;
925 
926 		    while (j) {
927 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
928 			++ctr;
929 			if (is_endian.little){
930 				PUTU32(ctx->Yi.c+12,ctr);
931 			}
932 			else
933 				ctx->Yi.d[3] = ctr;
934 			for (i=0; i<16; i+=sizeof(size_t))
935 				*(size_t *)(out+i) =
936 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
937 			out += 16;
938 			in  += 16;
939 			j   -= 16;
940 		    }
941 		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
942 		    len -= GHASH_CHUNK;
943 		}
944 		if ((i = (len&(size_t)-16))) {
945 		    size_t j=i;
946 
947 		    while (len>=16) {
948 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
949 			++ctr;
950 			if (is_endian.little){
951 				PUTU32(ctx->Yi.c+12,ctr);
952 			}
953 			else
954 				ctx->Yi.d[3] = ctr;
955 			for (i=0; i<16; i+=sizeof(size_t))
956 				*(size_t *)(out+i) =
957 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
958 			out += 16;
959 			in  += 16;
960 			len -= 16;
961 		    }
962 		    GHASH(ctx,out-j,j);
963 		}
964 #else
965 		while (len>=16) {
966 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
967 			++ctr;
968 			if (is_endian.little){
969 				PUTU32(ctx->Yi.c+12,ctr);
970 			}
971 			else
972 				ctx->Yi.d[3] = ctr;
973 			for (i=0; i<16; i+=sizeof(size_t))
974 				*(size_t *)(ctx->Xi.c+i) ^=
975 				*(size_t *)(out+i) =
976 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
977 			GCM_MUL(ctx,Xi);
978 			out += 16;
979 			in  += 16;
980 			len -= 16;
981 		}
982 #endif
983 		if (len) {
984 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
985 			++ctr;
986 			if (is_endian.little){
987 				PUTU32(ctx->Yi.c+12,ctr);
988 			}
989 			else
990 				ctx->Yi.d[3] = ctr;
991 			while (len--) {
992 				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
993 				++n;
994 			}
995 		}
996 
997 		ctx->mres = n;
998 		return 0;
999 	} while(0);
1000 #endif
1001 	for (i=0;i<len;++i) {
1002 		if (n==0) {
1003 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1004 			++ctr;
1005 			if (is_endian.little){
1006 				PUTU32(ctx->Yi.c+12,ctr);
1007 			}
1008 			else
1009 				ctx->Yi.d[3] = ctr;
1010 		}
1011 		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1012 		n = (n+1)%16;
1013 		if (n==0)
1014 			GCM_MUL(ctx,Xi);
1015 	}
1016 
1017 	ctx->mres = n;
1018 	return 0;
1019 }
1020 
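/*
 * CTR-mode decryption: same structure as the encrypt path, except that the
 * incoming ciphertext is the value hashed into Xi, before it is XORed with
 * the key stream to recover the plaintext.
 */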
1021 static int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1022 		const unsigned char *in, unsigned char *out,
1023 		size_t len)
1024 {
1025 	const union { long one; char little; } is_endian = {1};
1026 	unsigned int n, ctr;
1027 	size_t i;
1028 	u64        mlen  = ctx->len.u[1];
1029 	block128_f block = ctx->block;
1030 	void      *key   = ctx->key;
1031 #ifdef GCM_FUNCREF_4BIT
1032 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1033 # ifdef GHASH
1034 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1035 				const u8 *inp,size_t len)	= ctx->ghash;
1036 # endif
1037 #endif
1038 
1039 	mlen += len;
1040 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1041 		return -1;
1042 	ctx->len.u[1] = mlen;
1043 
1044 	if (ctx->ares) {
1045 		/* First call to decrypt finalizes GHASH(AAD) */
1046 		GCM_MUL(ctx,Xi);
1047 		ctx->ares = 0;
1048 	}
1049 
1050 	if (is_endian.little)
1051 		ctr = GETU32(ctx->Yi.c+12);
1052 	else
1053 		ctr = ctx->Yi.d[3];
1054 
1055 	n = ctx->mres;
1056 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1057 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1058 		if (n) {
1059 			while (n && len) {
1060 				u8 c = *(in++);
1061 				*(out++) = c^ctx->EKi.c[n];
1062 				ctx->Xi.c[n] ^= c;
1063 				--len;
1064 				n = (n+1)%16;
1065 			}
1066 			if (n==0) GCM_MUL (ctx,Xi);
1067 			else {
1068 				ctx->mres = n;
1069 				return 0;
1070 			}
1071 		}
1072 #if defined(STRICT_ALIGNMENT)
1073 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1074 			break;
1075 #endif
1076 #if defined(GHASH) && defined(GHASH_CHUNK)
1077 		while (len>=GHASH_CHUNK) {
1078 		    size_t j=GHASH_CHUNK;
1079 
1080 		    GHASH(ctx,in,GHASH_CHUNK);
1081 		    while (j) {
1082 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1083 			++ctr;
1084 			if (is_endian.little){
1085 				PUTU32(ctx->Yi.c+12,ctr);
1086 			}
1087 			else
1088 				ctx->Yi.d[3] = ctr;
1089 			for (i=0; i<16; i+=sizeof(size_t))
1090 				*(size_t *)(out+i) =
1091 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1092 			out += 16;
1093 			in  += 16;
1094 			j   -= 16;
1095 		    }
1096 		    len -= GHASH_CHUNK;
1097 		}
1098 		if ((i = (len&(size_t)-16))) {
1099 		    GHASH(ctx,in,i);
1100 		    while (len>=16) {
1101 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1102 			++ctr;
1103 			if (is_endian.little){
1104 				PUTU32(ctx->Yi.c+12,ctr);
1105 			}
1106 			else
1107 				ctx->Yi.d[3] = ctr;
1108 			for (i=0; i<16; i+=sizeof(size_t))
1109 				*(size_t *)(out+i) =
1110 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1111 			out += 16;
1112 			in  += 16;
1113 			len -= 16;
1114 		    }
1115 		}
1116 #else
1117 		while (len>=16) {
1118 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1119 			++ctr;
1120 			if (is_endian.little){
1121 				PUTU32(ctx->Yi.c+12,ctr);
1122 			}
1123 			else
1124 				ctx->Yi.d[3] = ctr;
1125 			for (i=0; i<16; i+=sizeof(size_t)) {
1126 				size_t c = *(size_t *)(in+i);
1127 				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1128 				*(size_t *)(ctx->Xi.c+i) ^= c;
1129 			}
1130 			GCM_MUL(ctx,Xi);
1131 			out += 16;
1132 			in  += 16;
1133 			len -= 16;
1134 		}
1135 #endif
1136 		if (len) {
1137 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1138 			++ctr;
1139 			if (is_endian.little){
1140 				PUTU32(ctx->Yi.c+12,ctr);
1141 			}
1142 			else
1143 				ctx->Yi.d[3] = ctr;
1144 			while (len--) {
1145 				u8 c = in[n];
1146 				ctx->Xi.c[n] ^= c;
1147 				out[n] = c^ctx->EKi.c[n];
1148 				++n;
1149 			}
1150 		}
1151 
1152 		ctx->mres = n;
1153 		return 0;
1154 	} while(0);
1155 #endif
1156 	for (i=0;i<len;++i) {
1157 		u8 c;
1158 		if (n==0) {
1159 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1160 			++ctr;
1161 			if (is_endian.little){
1162 				PUTU32(ctx->Yi.c+12,ctr);
1163 			}
1164 			else
1165 				ctx->Yi.d[3] = ctr;
1166 		}
1167 		c = in[i];
1168 		out[i] = c^ctx->EKi.c[n];
1169 		ctx->Xi.c[n] ^= c;
1170 		n = (n+1)%16;
1171 		if (n==0)
1172 			GCM_MUL(ctx,Xi);
1173 	}
1174 
1175 	ctx->mres = n;
1176 	return 0;
1177 }
1178 
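/*
 * Encrypt using a caller-supplied ctr128_f routine that processes whole
 * counter-mode blocks in bulk; leftover bytes fall back to the single-block
 * cipher.  Ciphertext hashing is the same as in CRYPTO_gcm128_encrypt.
 */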
1179 static int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1180 		const unsigned char *in, unsigned char *out,
1181 		size_t len, ctr128_f stream)
1182 {
1183 	const union { long one; char little; } is_endian = {1};
1184 	unsigned int n, ctr;
1185 	size_t i;
1186 	u64   mlen = ctx->len.u[1];
1187 	void *key  = ctx->key;
1188 #ifdef GCM_FUNCREF_4BIT
1189 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1190 # ifdef GHASH
1191 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1192 				const u8 *inp,size_t len)	= ctx->ghash;
1193 # endif
1194 #endif
1195 
1196 	mlen += len;
1197 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1198 		return -1;
1199 	ctx->len.u[1] = mlen;
1200 
1201 	if (ctx->ares) {
1202 		/* First call to encrypt finalizes GHASH(AAD) */
1203 		GCM_MUL(ctx,Xi);
1204 		ctx->ares = 0;
1205 	}
1206 
1207 	if (is_endian.little)
1208 		ctr = GETU32(ctx->Yi.c+12);
1209 	else
1210 		ctr = ctx->Yi.d[3];
1211 
1212 	n = ctx->mres;
1213 	if (n) {
1214 		while (n && len) {
1215 			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1216 			--len;
1217 			n = (n+1)%16;
1218 		}
1219 		if (n==0) GCM_MUL(ctx,Xi);
1220 		else {
1221 			ctx->mres = n;
1222 			return 0;
1223 		}
1224 	}
1225 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1226 	while (len>=GHASH_CHUNK) {
1227 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1228 		ctr += GHASH_CHUNK/16;
1229 		if (is_endian.little){
1230 			PUTU32(ctx->Yi.c+12,ctr);
1231 		}
1232 		else
1233 			ctx->Yi.d[3] = ctr;
1234 		GHASH(ctx,out,GHASH_CHUNK);
1235 		out += GHASH_CHUNK;
1236 		in  += GHASH_CHUNK;
1237 		len -= GHASH_CHUNK;
1238 	}
1239 #endif
1240 	i = (len&(size_t)-16);
1241 	if (i) {
1242 		size_t j=i/16;
1243 
1244 		(*stream)(in,out,j,key,ctx->Yi.c);
1245 		ctr += (unsigned int)j;
1246 		if (is_endian.little){
1247 			PUTU32(ctx->Yi.c+12,ctr);
1248 		}
1249 		else
1250 			ctx->Yi.d[3] = ctr;
1251 		in  += i;
1252 		len -= i;
1253 #if defined(GHASH)
1254 		GHASH(ctx,out,i);
1255 		out += i;
1256 #else
1257 		while (j--) {
1258 			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1259 			GCM_MUL(ctx,Xi);
1260 			out += 16;
1261 		}
1262 #endif
1263 	}
1264 	if (len) {
1265 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1266 		++ctr;
1267 		if (is_endian.little){
1268 			PUTU32(ctx->Yi.c+12,ctr);
1269 		}
1270 		else
1271 			ctx->Yi.d[3] = ctr;
1272 		while (len--) {
1273 			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1274 			++n;
1275 		}
1276 	}
1277 
1278 	ctx->mres = n;
1279 	return 0;
1280 }
1281 
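/*
 * Decrypt counterpart of CRYPTO_gcm128_encrypt_ctr32: the incoming
 * ciphertext is hashed first, then the bulk counter-mode routine is run
 * over it.
 */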
1282 static int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1283 		const unsigned char *in, unsigned char *out,
1284 		size_t len,ctr128_f stream)
1285 {
1286 	const union { long one; char little; } is_endian = {1};
1287 	unsigned int n, ctr;
1288 	size_t i;
1289 	u64   mlen = ctx->len.u[1];
1290 	void *key  = ctx->key;
1291 #ifdef GCM_FUNCREF_4BIT
1292 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1293 # ifdef GHASH
1294 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1295 				const u8 *inp,size_t len)	= ctx->ghash;
1296 # endif
1297 #endif
1298 
1299 	mlen += len;
1300 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1301 		return -1;
1302 	ctx->len.u[1] = mlen;
1303 
1304 	if (ctx->ares) {
1305 		/* First call to decrypt finalizes GHASH(AAD) */
1306 		GCM_MUL(ctx,Xi);
1307 		ctx->ares = 0;
1308 	}
1309 
1310 	if (is_endian.little)
1311 		ctr = GETU32(ctx->Yi.c+12);
1312 	else
1313 		ctr = ctx->Yi.d[3];
1314 
1315 	n = ctx->mres;
1316 	if (n) {
1317 		while (n && len) {
1318 			u8 c = *(in++);
1319 			*(out++) = c^ctx->EKi.c[n];
1320 			ctx->Xi.c[n] ^= c;
1321 			--len;
1322 			n = (n+1)%16;
1323 		}
1324 		if (n==0) GCM_MUL (ctx,Xi);
1325 		else {
1326 			ctx->mres = n;
1327 			return 0;
1328 		}
1329 	}
1330 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1331 	while (len>=GHASH_CHUNK) {
1332 		GHASH(ctx,in,GHASH_CHUNK);
1333 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1334 		ctr += GHASH_CHUNK/16;
1335 		if (is_endian.little){
1336 			PUTU32(ctx->Yi.c+12,ctr);
1337 		}
1338 		else
1339 			ctx->Yi.d[3] = ctr;
1340 		out += GHASH_CHUNK;
1341 		in  += GHASH_CHUNK;
1342 		len -= GHASH_CHUNK;
1343 	}
1344 #endif
1345 	i = (len&(size_t)-16);
1346 	if (i) {
1347 		size_t j=i/16;
1348 
1349 #if defined(GHASH)
1350 		GHASH(ctx,in,i);
1351 #else
1352 		while (j--) {
1353 			size_t k;
1354 			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1355 			GCM_MUL(ctx,Xi);
1356 			in += 16;
1357 		}
1358 		j   = i/16;
1359 		in -= i;
1360 #endif
1361 		(*stream)(in,out,j,key,ctx->Yi.c);
1362 		ctr += (unsigned int)j;
1363 		if (is_endian.little){
1364 			PUTU32(ctx->Yi.c+12,ctr);
1365 		}
1366 		else
1367 			ctx->Yi.d[3] = ctr;
1368 		out += i;
1369 		in  += i;
1370 		len -= i;
1371 	}
1372 	if (len) {
1373 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1374 		++ctr;
1375 		if (is_endian.little){
1376 			PUTU32(ctx->Yi.c+12,ctr);
1377 		}
1378 		else
1379 			ctx->Yi.d[3] = ctr;
1380 		while (len--) {
1381 			u8 c = in[n];
1382 			ctx->Xi.c[n] ^= c;
1383 			out[n] = c^ctx->EKi.c[n];
1384 			++n;
1385 		}
1386 	}
1387 
1388 	ctx->mres = n;
1389 	return 0;
1390 }
1391 
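/*
 * Finalise GHASH with the AAD and ciphertext bit lengths, XOR in EK0 to form
 * the tag in ctx->Xi, and compare against the supplied tag if there is one.
 * Returns 0 on a match, non-zero on mismatch or when no tag was supplied.
 */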
1392 static int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1393 			size_t len)
1394 {
1395 	const union { long one; char little; } is_endian = {1};
1396 	u64 alen = ctx->len.u[0]<<3;
1397 	u64 clen = ctx->len.u[1]<<3;
1398 #ifdef GCM_FUNCREF_4BIT
1399 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1400 #endif
1401 
1402 	if (ctx->mres || ctx->ares)
1403 		GCM_MUL(ctx,Xi);
1404 
1405 	if (is_endian.little) {
1406 #ifdef BSWAP8
1407 		alen = BSWAP8(alen);
1408 		clen = BSWAP8(clen);
1409 #else
1410 		u8 *p = ctx->len.c;
1411 
1412 		ctx->len.u[0] = alen;
1413 		ctx->len.u[1] = clen;
1414 
1415 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1416 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1417 #endif
1418 	}
1419 
1420 	ctx->Xi.u[0] ^= alen;
1421 	ctx->Xi.u[1] ^= clen;
1422 	GCM_MUL(ctx,Xi);
1423 
1424 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1425 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1426 
1427 	if (tag && len<=sizeof(ctx->Xi))
1428 		return memcmp(ctx->Xi.c,tag,len);
1429 	else
1430 		return -1;
1431 }
1432 
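/* Convenience wrapper: finalise and copy out at most 16 tag bytes. */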
1433 static void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1434 {
1435 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1436 	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1437 }
1438 
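/*
 * SM4-GCM entry point for this test: run SM4 key setup, drive the generic
 * GCM-128 code above with single-block SM4 encryption as the block128_f, and
 * always emit the computed tag.  Note that in the decrypt path (enc == 0) the
 * tag is only recomputed, not verified; the caller must compare it against
 * the expected value.
 *
 * Minimal usage sketch (assumes the sm4_ae_in/sm4_ae_out fields declared in
 * sm4_core.h are plain pointers and lengths; a 16-byte key and tag are
 * typical for SM4-GCM, but the exact sizes are an assumption here):
 *
 *	struct sm4_ae_in  in  = { .key = key, .iv = iv, .iv_len = 12,
 *	                          .aad = aad, .aad_len = aad_len,
 *	                          .src = plain, .src_len = plain_len,
 *	                          .tag_size = 16 };
 *	struct sm4_ae_out out = { .dest = cipher, .tag = tag };
 *	rk_sm4_gcm_encrypt(&in, &out, 1);
 */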
1439 int rk_sm4_gcm_encrypt(struct sm4_ae_in *in, struct sm4_ae_out *out, const int enc)
1440 {
1441 	GCM128_CONTEXT ctx;
1442 	sm4_context  sm4_ctx;
1443 
1444 	if (in == NULL || out == NULL)
1445 		return -1;
1446 
1447 	rk_sm4_setkey_enc(&sm4_ctx, in->key);
1448 	CRYPTO_gcm128_init(&ctx, &sm4_ctx, rk_rk_sm4_crypt_ecb);
1449 	CRYPTO_gcm128_setiv(&ctx, in->iv, in->iv_len);
1450 	if (in->aad_len) CRYPTO_gcm128_aad(&ctx, in->aad, in->aad_len);
1451 	if (enc) {
1452 		if (in->src_len) CRYPTO_gcm128_encrypt(&ctx, in->src, out->dest, in->src_len);
1453 		CRYPTO_gcm128_tag(&ctx, out->tag, in->tag_size);
1454 		return 0;
1455 	} else {
1456 		if (in->src_len) CRYPTO_gcm128_decrypt(&ctx, in->src, out->dest, in->src_len);
1457 		CRYPTO_gcm128_tag(&ctx, out->tag, in->tag_size);
1458 		return 0;
1459 	}
1460 }
1461 
1462 
1463