1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include "sm4_core.h"
5 #include "sm4_locl.h"
6
7 #define OPENSSL_FIPSAPI
8 #define TABLE_BITS 1
10 #define DEBUG(format,...) printf("[%s]:%d: "format"\n", __func__,__LINE__, ##__VA_ARGS__)
11
12 #ifndef MODES_DEBUG
13 # ifndef NDEBUG
14 # define NDEBUG
15 # endif
16 #endif
17
18 #if defined(BSWAP4)
19 /* redefine, because alignment is ensured */
20 #undef GETU32
21 #define GETU32(p) BSWAP4(*(const u32 *)(p))
22 #undef PUTU32
23 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
24 #endif
25
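/*
 * PACK() places a 16-bit reduction constant in the top 16 bits of a
 * size_t, so that rem_4bit/rem_8bit entries can be XORed directly into
 * the high end of the accumulator.  REDUCE1BIT() multiplies V by x in
 * GCM's bit-reflected representation of GF(2^128): shift right by one
 * bit and, if the bit shifted out was set, XOR in the reduction
 * constant 0xE1000000... (the polynomial x^128 + x^7 + x^2 + x + 1).
 */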
26 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
27 #define REDUCE1BIT(V) do { \
28 if (sizeof(size_t)==8) { \
29 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
30 V.lo = (V.hi<<63)|(V.lo>>1); \
31 V.hi = (V.hi>>1 )^T; \
32 } \
33 else { \
34 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
35 V.lo = (V.hi<<63)|(V.lo>>1); \
36 V.hi = (V.hi>>1 )^((u64)T<<32); \
37 } \
38 } while(0)
39
40 typedef struct { u64 hi,lo; } u128;
41
42 typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
43 unsigned int blocks, const void *key,
44 const unsigned char ivec[16]);
45
46
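/*
 * Yi is the current counter block, EKi the corresponding keystream
 * block E_K(Yi), EK0 the encrypted pre-counter block used to mask the
 * final tag, len holds the AAD and message byte counts, Xi is the
 * running GHASH accumulator and H the hash subkey E_K(0^128) in host
 * byte order.  ares/mres count buffered bytes of a partial AAD/message
 * block, block is the raw block cipher and key its schedule.
 */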
47 struct gcm128_context {
48 /* The following six names follow those used in the GCM specification */
49 union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len,
50 Xi,H;
51 /* Relative position of Xi, H and pre-computed Htable is used
52 * in some assembler modules, i.e. don't change the order! */
53 #if TABLE_BITS==8
54 u128 Htable[256];
55 #else
56 u128 Htable[16];
57 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
58 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
59 #endif
60 unsigned int mres, ares;
61 block128_f block;
62 void *key;
63 };
64
65 typedef struct gcm128_context GCM128_CONTEXT;
66
67 /*
68 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
69 * never be set to 8. 8 is effectively reserved for testing purposes.
70 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
71 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
72 * whole spectrum of possible table-driven implementations. Why? In
73 * the non-"Shoup's" case the memory access pattern is segmented in
74 * such a manner that it is trivial to see that cache-timing
75 * information can reveal a fair portion of the intermediate hash
76 * value. Given that ciphertext is always available to an attacker,
77 * it is possible to attempt to deduce the secret parameter H and, if
78 * successful, tamper with messages [which is trivial in CTR mode].
79 * In "Shoup's" case it is not as trivial, but there is no reason to
80 * believe that it is resistant to cache-timing attacks. The catch
81 * with the "8-bit" implementation is that it consumes 16 (sixteen)
82 * times more memory, 4KB per individual key + 1KB shared. On the
83 * plus side it should be about twice as fast as the "4-bit" version;
84 * for gcc-generated x86[_64] code the "8-bit" version was observed to
85 * run ~75% faster, closer to 100% for commercial compilers... Yet the
86 * "4-bit" procedure is preferred, because it is believed to provide a
87 * better security-performance balance and adequate all-round
88 * performance. "All-round" refers to things like:
89 * - shorter setup time effectively improves overall timing for
90 * handling short messages;
91 * - larger table allocation can become unbearable because of VM
92 * subsystem penalties (for example, on Windows a large enough free
93 * triggers VM working-set trimming, meaning that a subsequent
94 * malloc immediately incurs working-set expansion);
95 * - larger table has larger cache footprint, which can affect
96 * performance of other code paths (not necessarily even from same
97 * thread in Hyper-Threading world);
98 *
99 * Value of 1 is not appropriate for performance reasons.
100 */
101 #if TABLE_BITS==8
102
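/*
 * Build the 256-entry "8-bit" Shoup table: Htable[i] holds H multiplied
 * by the field element whose (bit-reflected) byte encoding is i.  The
 * power-of-two entries are generated from Htable[128] = H by repeated
 * REDUCE1BIT steps; the remaining entries are XOR combinations of those.
 */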
103 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
104 {
105 int i, j;
106 u128 V;
107
108 Htable[0].hi = 0;
109 Htable[0].lo = 0;
110 V.hi = H[0];
111 V.lo = H[1];
112
113 for (Htable[128]=V, i=64; i>0; i>>=1) {
114 REDUCE1BIT(V);
115 Htable[i] = V;
116 }
117
118 for (i=2; i<256; i<<=1) {
119 u128 *Hi = Htable+i, H0 = *Hi;
120 for (j=1; j<i; ++j) {
121 Hi[j].hi = H0.hi^Htable[j].hi;
122 Hi[j].lo = H0.lo^Htable[j].lo;
123 }
124 }
125 }
126
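/*
 * Multiply Xi by H using the 8-bit table: Xi is consumed one byte at a
 * time from the last byte to the first, and rem_8bit folds the eight
 * bits shifted out of the accumulator back in on every step.  The
 * result is written back to Xi in big-endian byte order.
 */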
127 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
128 {
129 u128 Z = { 0, 0};
130 const u8 *xi = (const u8 *)Xi+15;
131 size_t rem, n = *xi;
132 const union { long one; char little; } is_endian = {1};
133 static const size_t rem_8bit[256] = {
134 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
135 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
136 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
137 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
138 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
139 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
140 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
141 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
142 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
143 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
144 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
145 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
146 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
147 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
148 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
149 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
150 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
151 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
152 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
153 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
154 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
155 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
156 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
157 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
158 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
159 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
160 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
161 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
162 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
163 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
164 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
165 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
166 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
167 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
168 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
169 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
170 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
171 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
172 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
173 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
174 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
175 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
176 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
177 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
178 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
179 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
180 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
181 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
182 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
183 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
184 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
185 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
186 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
187 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
188 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
189 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
190 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
191 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
192 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
193 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
194 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
195 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
196 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
197 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
198
199 while (1) {
200 Z.hi ^= Htable[n].hi;
201 Z.lo ^= Htable[n].lo;
202
203 if ((u8 *)Xi==xi) break;
204
205 n = *(--xi);
206
207 rem = (size_t)Z.lo&0xff;
208 Z.lo = (Z.hi<<56)|(Z.lo>>8);
209 Z.hi = (Z.hi>>8);
210 if (sizeof(size_t)==8)
211 Z.hi ^= rem_8bit[rem];
212 else
213 Z.hi ^= (u64)rem_8bit[rem]<<32;
214 }
215
216 if (is_endian.little) {
217 #ifdef BSWAP8
218 Xi[0] = BSWAP8(Z.hi);
219 Xi[1] = BSWAP8(Z.lo);
220 #else
221 u8 *p = (u8 *)Xi;
222 u32 v;
223 v = (u32)(Z.hi>>32); PUTU32(p,v);
224 v = (u32)(Z.hi); PUTU32(p+4,v);
225 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
226 v = (u32)(Z.lo); PUTU32(p+12,v);
227 #endif
228 }
229 else {
230 Xi[0] = Z.hi;
231 Xi[1] = Z.lo;
232 }
233 }
234 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
235
236 #elif TABLE_BITS==4
237
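/*
 * Build the 16-entry "4-bit" table: Htable[i] holds H multiplied by the
 * field element encoded by nibble i.  The small-footprint path uses the
 * generic loop, the default path unrolls it; for the ARM assembler the
 * 64-bit halves are swapped into the dword order it expects.
 */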
238 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
239 {
240 u128 V;
241 #if defined(OPENSSL_SMALL_FOOTPRINT)
242 int i;
243 #endif
244
245 Htable[0].hi = 0;
246 Htable[0].lo = 0;
247 V.hi = H[0];
248 V.lo = H[1];
249
250 #if defined(OPENSSL_SMALL_FOOTPRINT)
251 for (Htable[8]=V, i=4; i>0; i>>=1) {
252 REDUCE1BIT(V);
253 Htable[i] = V;
254 }
255
256 for (i=2; i<16; i<<=1) {
257 u128 *Hi = Htable+i;
258 int j;
259 for (V=*Hi, j=1; j<i; ++j) {
260 Hi[j].hi = V.hi^Htable[j].hi;
261 Hi[j].lo = V.lo^Htable[j].lo;
262 }
263 }
264 #else
265 Htable[8] = V;
266 REDUCE1BIT(V);
267 Htable[4] = V;
268 REDUCE1BIT(V);
269 Htable[2] = V;
270 REDUCE1BIT(V);
271 Htable[1] = V;
272 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
273 V=Htable[4];
274 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
275 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
276 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
277 V=Htable[8];
278 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
279 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
280 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
281 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
282 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
283 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
284 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
285 #endif
286 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
287 /*
288 * ARM assembler expects specific dword order in Htable.
289 */
290 {
291 int j;
292 const union { long one; char little; } is_endian = {1};
293
294 if (is_endian.little)
295 for (j=0;j<16;++j) {
296 V = Htable[j];
297 Htable[j].hi = V.lo;
298 Htable[j].lo = V.hi;
299 }
300 else
301 for (j=0;j<16;++j) {
302 V = Htable[j];
303 Htable[j].hi = V.lo<<32|V.lo>>32;
304 Htable[j].lo = V.hi<<32|V.hi>>32;
305 }
306 }
307 #endif
308 }
309
310 #ifndef GHASH_ASM
311 static const size_t rem_4bit[16] = {
312 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
313 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
314 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
315 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
316
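/*
 * Multiply Xi by H four bits at a time: for each byte of Xi (last byte
 * first) the low nibble and then the high nibble select an Htable
 * entry, while rem_4bit folds the four bits shifted out of the
 * accumulator back into its high end.
 */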
317 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
318 {
319 u128 Z;
320 int cnt = 15;
321 size_t rem, nlo, nhi;
322 const union { long one; char little; } is_endian = {1};
323
324 nlo = ((const u8 *)Xi)[15];
325 nhi = nlo>>4;
326 nlo &= 0xf;
327
328 Z.hi = Htable[nlo].hi;
329 Z.lo = Htable[nlo].lo;
330
331 while (1) {
332 rem = (size_t)Z.lo&0xf;
333 Z.lo = (Z.hi<<60)|(Z.lo>>4);
334 Z.hi = (Z.hi>>4);
335 if (sizeof(size_t)==8)
336 Z.hi ^= rem_4bit[rem];
337 else
338 Z.hi ^= (u64)rem_4bit[rem]<<32;
339
340 Z.hi ^= Htable[nhi].hi;
341 Z.lo ^= Htable[nhi].lo;
342
343 if (--cnt<0) break;
344
345 nlo = ((const u8 *)Xi)[cnt];
346 nhi = nlo>>4;
347 nlo &= 0xf;
348
349 rem = (size_t)Z.lo&0xf;
350 Z.lo = (Z.hi<<60)|(Z.lo>>4);
351 Z.hi = (Z.hi>>4);
352 if (sizeof(size_t)==8)
353 Z.hi ^= rem_4bit[rem];
354 else
355 Z.hi ^= (u64)rem_4bit[rem]<<32;
356
357 Z.hi ^= Htable[nlo].hi;
358 Z.lo ^= Htable[nlo].lo;
359 }
360
361 if (is_endian.little) {
362 #ifdef BSWAP8
363 Xi[0] = BSWAP8(Z.hi);
364 Xi[1] = BSWAP8(Z.lo);
365 #else
366 u8 *p = (u8 *)Xi;
367 u32 v;
368 v = (u32)(Z.hi>>32); PUTU32(p,v);
369 v = (u32)(Z.hi); PUTU32(p+4,v);
370 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
371 v = (u32)(Z.lo); PUTU32(p+12,v);
372 #endif
373 }
374 else {
375 Xi[0] = Z.hi;
376 Xi[1] = Z.lo;
377 }
378 }
379
380 #if !defined(OPENSSL_SMALL_FOOTPRINT)
381 /*
382 * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
383 * for details... Compiler-generated code doesn't seem to give any
384 * performance improvement, at least not on x86[_64]. It's here
385 * mostly as a reference and a placeholder for possible future
386 * non-trivial optimization[s]...
387 */
388 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
389 const u8 *inp,size_t len)
390 {
391 u128 Z;
392 int cnt;
393 size_t rem, nlo, nhi;
394 const union { long one; char little; } is_endian = {1};
395
396 #if 1
397 do {
398 cnt = 15;
399 nlo = ((const u8 *)Xi)[15];
400 nlo ^= inp[15];
401 nhi = nlo>>4;
402 nlo &= 0xf;
403
404 Z.hi = Htable[nlo].hi;
405 Z.lo = Htable[nlo].lo;
406
407 while (1) {
408 rem = (size_t)Z.lo&0xf;
409 Z.lo = (Z.hi<<60)|(Z.lo>>4);
410 Z.hi = (Z.hi>>4);
411 if (sizeof(size_t)==8)
412 Z.hi ^= rem_4bit[rem];
413 else
414 Z.hi ^= (u64)rem_4bit[rem]<<32;
415
416 Z.hi ^= Htable[nhi].hi;
417 Z.lo ^= Htable[nhi].lo;
418
419 if (--cnt<0) break;
420
421 nlo = ((const u8 *)Xi)[cnt];
422 nlo ^= inp[cnt];
423 nhi = nlo>>4;
424 nlo &= 0xf;
425
426 rem = (size_t)Z.lo&0xf;
427 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428 Z.hi = (Z.hi>>4);
429 if (sizeof(size_t)==8)
430 Z.hi ^= rem_4bit[rem];
431 else
432 Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434 Z.hi ^= Htable[nlo].hi;
435 Z.lo ^= Htable[nlo].lo;
436 }
437 #else
438 /*
439 * Extra 256+16 bytes per-key plus 512 bytes shared tables
440 * [should] give ~50% improvement... One could have PACK()-ed
441 * the rem_8bit even here, but the priority is to minimize
442 * cache footprint...
443 */
444 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
445 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
446 static const unsigned short rem_8bit[256] = {
447 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
448 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
449 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
450 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
451 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
452 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
453 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
454 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
455 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
456 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
457 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
458 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
459 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
460 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
461 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
462 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
463 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
464 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
465 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
466 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
467 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
468 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
469 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
470 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
471 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
472 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
473 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
474 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
475 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
476 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
477 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
478 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
479 /*
480 * This pre-processing phase slows the procedure down by approximately
481 * as much time as it saves in each loop iteration. In other words,
482 * single-block performance is about the same as the straightforward
483 * "4-bit" implementation's, and beyond that it only gets faster...
484 */
485 for (cnt=0; cnt<16; ++cnt) {
486 Z.hi = Htable[cnt].hi;
487 Z.lo = Htable[cnt].lo;
488 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
489 Hshr4[cnt].hi = (Z.hi>>4);
490 Hshl4[cnt] = (u8)(Z.lo<<4);
491 }
492
493 do {
494 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
495 nlo = ((const u8 *)Xi)[cnt];
496 nlo ^= inp[cnt];
497 nhi = nlo>>4;
498 nlo &= 0xf;
499
500 Z.hi ^= Htable[nlo].hi;
501 Z.lo ^= Htable[nlo].lo;
502
503 rem = (size_t)Z.lo&0xff;
504
505 Z.lo = (Z.hi<<56)|(Z.lo>>8);
506 Z.hi = (Z.hi>>8);
507
508 Z.hi ^= Hshr4[nhi].hi;
509 Z.lo ^= Hshr4[nhi].lo;
510 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
511 }
512
513 nlo = ((const u8 *)Xi)[0];
514 nlo ^= inp[0];
515 nhi = nlo>>4;
516 nlo &= 0xf;
517
518 Z.hi ^= Htable[nlo].hi;
519 Z.lo ^= Htable[nlo].lo;
520
521 rem = (size_t)Z.lo&0xf;
522
523 Z.lo = (Z.hi<<60)|(Z.lo>>4);
524 Z.hi = (Z.hi>>4);
525
526 Z.hi ^= Htable[nhi].hi;
527 Z.lo ^= Htable[nhi].lo;
528 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
529 #endif
530
531 if (is_endian.little) {
532 #ifdef BSWAP8
533 Xi[0] = BSWAP8(Z.hi);
534 Xi[1] = BSWAP8(Z.lo);
535 #else
536 u8 *p = (u8 *)Xi;
537 u32 v;
538 v = (u32)(Z.hi>>32); PUTU32(p,v);
539 v = (u32)(Z.hi); PUTU32(p+4,v);
540 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
541 v = (u32)(Z.lo); PUTU32(p+12,v);
542 #endif
543 }
544 else {
545 Xi[0] = Z.hi;
546 Xi[1] = Z.lo;
547 }
548 } while (inp+=16, len-=16);
549 }
550 #endif
551 #else
552 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
553 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
554 #endif
555
556 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
557 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
558 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
559 /* GHASH_CHUNK is a "stride parameter" intended to mitigate the cache
560 * thrashing effect. In other words, the idea is to hash the data while
561 * it is still in the L1 cache after the encryption pass... */
562 #define GHASH_CHUNK (3*1024)
563 #endif
564
565 #else /* TABLE_BITS */
566
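/*
 * Bit-serial multiplication of Xi by H, selected by TABLE_BITS==1
 * above: every bit of Xi conditionally accumulates V into Z and V is
 * advanced by one REDUCE1BIT step.  No lookup tables are used, so the
 * memory footprint and access pattern are minimal, at the cost of
 * speed.
 */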
567 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
568 {
569 u128 V,Z = { 0,0 };
570 long X;
571 unsigned int i,j;
572 const long *xi = (const long *)Xi;
573 const union { long one; char little; } is_endian = {1};
574
575 V.hi = H[0]; /* H is in host byte order, no byte swapping */
576 V.lo = H[1];
577
578 for (j=0; j<16/sizeof(long); ++j) {
579 if (is_endian.little) {
580 if (sizeof(long)==8) {
581 #ifdef BSWAP8
582 X = (long)(BSWAP8(xi[j]));
583 #else
584 const u8 *p = (const u8 *)(xi+j);
585 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
586 #endif
587 }
588 else {
589 const u8 *p = (const u8 *)(xi+j);
590 X = (long)GETU32(p);
591 }
592 }
593 else
594 X = xi[j];
595
596 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
597 u64 M = (u64)(X>>(8*sizeof(long)-1));
598 Z.hi ^= V.hi&M;
599 Z.lo ^= V.lo&M;
600
601 REDUCE1BIT(V);
602 }
603 }
604
605 if (is_endian.little) {
606 #ifdef BSWAP8
607 Xi[0] = BSWAP8(Z.hi);
608 Xi[1] = BSWAP8(Z.lo);
609 #else
610 u8 *p = (u8 *)Xi;
611 u32 v;
612 v = (u32)(Z.hi>>32); PUTU32(p,v);
613 v = (u32)(Z.hi); PUTU32(p+4,v);
614 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
615 v = (u32)(Z.lo); PUTU32(p+12,v);
616 #endif
617 }
618 else {
619 Xi[0] = Z.hi;
620 Xi[1] = Z.lo;
621 }
622 }
623 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
624
625 #endif
626
627 #if TABLE_BITS==4 && defined(GHASH_ASM)
628 # if !defined(I386_ONLY) && \
629 (defined(__i386) || defined(__i386__) || \
630 defined(__x86_64) || defined(__x86_64__) || \
631 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
632 # define GHASH_ASM_X86_OR_64
633 # define GCM_FUNCREF_4BIT
634 extern unsigned int OPENSSL_ia32cap_P[2];
635
636 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
637 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
638 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
639
640 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
641 # define GHASH_ASM_X86
642 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
643 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
644
645 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
646 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
647 # endif
648 # elif defined(__arm__) || defined(__arm)
649 # include "arm_arch.h"
650 # if __ARM_ARCH__>=7
651 # define GHASH_ASM_ARM
652 # define GCM_FUNCREF_4BIT
653 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
654 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
655 # endif
656 # endif
657 #endif
658
659 #ifdef GCM_FUNCREF_4BIT
660 # undef GCM_MUL
661 # define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
662 # ifdef GHASH
663 # undef GHASH
664 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
665 # endif
666 #endif
667
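/*
 * Initialize a GCM context: record the block cipher and key, compute
 * the hash subkey H = E_K(0^128), convert it to host byte order and,
 * for table-driven builds, precompute Htable and select the fastest
 * available gmult/ghash routines (PCLMULQDQ, MMX/SSE, NEON or plain C).
 * With TABLE_BITS==1 no table is built and GCM_MUL falls back to
 * gcm_gmult_1bit.
 */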
668 static void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
669 {
670 const union { long one; char little; } is_endian = {1};
671
672 memset(ctx,0,sizeof(*ctx));
673 ctx->block = block;
674 ctx->key = key;
675
676 (*block)(ctx->H.c, ctx->H.c,key);
677
678 if (is_endian.little) {
679 /* H is stored in host byte order */
680 #ifdef BSWAP8
681 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
682 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
683 #else
684 u8 *p = ctx->H.c;
685 u64 hi,lo;
686 hi = (u64)GETU32(p) <<32|GETU32(p+4);
687 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
688 ctx->H.u[0] = hi;
689 ctx->H.u[1] = lo;
690 #endif
691 }
692
693 #if TABLE_BITS==8
694 gcm_init_8bit(ctx->Htable,ctx->H.u);
695 #elif TABLE_BITS==4
696 # if defined(GHASH_ASM_X86_OR_64)
697 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
698 if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
699 OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
700 gcm_init_clmul(ctx->Htable,ctx->H.u);
701 ctx->gmult = gcm_gmult_clmul;
702 ctx->ghash = gcm_ghash_clmul;
703 return;
704 }
705 # endif
706 gcm_init_4bit(ctx->Htable,ctx->H.u);
707 # if defined(GHASH_ASM_X86) /* x86 only */
708 # if defined(OPENSSL_IA32_SSE2)
709 if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
710 # else
711 if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
712 # endif
713 ctx->gmult = gcm_gmult_4bit_mmx;
714 ctx->ghash = gcm_ghash_4bit_mmx;
715 } else {
716 ctx->gmult = gcm_gmult_4bit_x86;
717 ctx->ghash = gcm_ghash_4bit_x86;
718 }
719 # else
720 ctx->gmult = gcm_gmult_4bit;
721 ctx->ghash = gcm_ghash_4bit;
722 # endif
723 # elif defined(GHASH_ASM_ARM)
724 if (OPENSSL_armcap_P & ARMV7_NEON) {
725 ctx->gmult = gcm_gmult_neon;
726 ctx->ghash = gcm_ghash_neon;
727 } else {
728 gcm_init_4bit(ctx->Htable,ctx->H.u);
729 ctx->gmult = gcm_gmult_4bit;
730 ctx->ghash = gcm_ghash_4bit;
731 }
732 # else
733 gcm_init_4bit(ctx->Htable,ctx->H.u);
734 # endif
735 #endif
736 }
737
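/*
 * Set the IV/nonce: reset lengths and the GHASH state, then derive the
 * pre-counter block Y0 -- a 96-bit IV is used directly as IV||0^31||1,
 * any other length is hashed together with its bit length.  EK0 =
 * E_K(Y0) is computed here for use in the final tag, and Yi is left at
 * Y1, ready for the first data block.
 */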
738 static void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
739 {
740 const union { long one; char little; } is_endian = {1};
741 unsigned int ctr;
742 #ifdef GCM_FUNCREF_4BIT
743 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
744 #endif
745
746 ctx->Yi.u[0] = 0;
747 ctx->Yi.u[1] = 0;
748 ctx->Xi.u[0] = 0;
749 ctx->Xi.u[1] = 0;
750 ctx->len.u[0] = 0; /* AAD length */
751 ctx->len.u[1] = 0; /* message length */
752 ctx->ares = 0;
753 ctx->mres = 0;
754
755 if (len==12) {
756 memcpy(ctx->Yi.c,iv,12);
757 ctx->Yi.c[15]=1;
758 ctr=1;
759 }
760 else {
761 size_t i;
762 u64 len0 = len;
763
764 while (len>=16) {
765 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
766 GCM_MUL(ctx,Yi);
767 iv += 16;
768 len -= 16;
769 }
770 if (len) {
771 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
772 GCM_MUL(ctx,Yi);
773 }
774 len0 <<= 3;
775 if (is_endian.little) {
776 #ifdef BSWAP8
777 ctx->Yi.u[1] ^= BSWAP8(len0);
778 #else
779 ctx->Yi.c[8] ^= (u8)(len0>>56);
780 ctx->Yi.c[9] ^= (u8)(len0>>48);
781 ctx->Yi.c[10] ^= (u8)(len0>>40);
782 ctx->Yi.c[11] ^= (u8)(len0>>32);
783 ctx->Yi.c[12] ^= (u8)(len0>>24);
784 ctx->Yi.c[13] ^= (u8)(len0>>16);
785 ctx->Yi.c[14] ^= (u8)(len0>>8);
786 ctx->Yi.c[15] ^= (u8)(len0);
787 #endif
788 }
789 else
790 ctx->Yi.u[1] ^= len0;
791
792 GCM_MUL(ctx,Yi);
793
794 if (is_endian.little)
795 ctr = GETU32(ctx->Yi.c+12);
796 else
797 ctr = ctx->Yi.d[3];
798 }
799
800 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
801 ++ctr;
802 if (is_endian.little){
803 PUTU32(ctx->Yi.c+12,ctr);
804 }
805 else
806 ctx->Yi.d[3] = ctr;
807 }
808
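/*
 * Absorb additional authenticated data: must be called after setiv and
 * before any encrypt/decrypt call (returns -2 once message data has
 * been processed).  AAD is folded into Xi sixteen bytes at a time, with
 * ares tracking the residue of a partial block; returns -1 if the total
 * AAD length exceeds 2^61 bytes.
 */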
809 static int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
810 {
811 size_t i;
812 unsigned int n;
813 u64 alen = ctx->len.u[0];
814 #ifdef GCM_FUNCREF_4BIT
815 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
816 # ifdef GHASH
817 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
818 const u8 *inp,size_t len) = ctx->ghash;
819 # endif
820 #endif
821
822 if (ctx->len.u[1]) return -2;
823
824 alen += len;
825 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
826 return -1;
827 ctx->len.u[0] = alen;
828
829 n = ctx->ares;
830 if (n) {
831 while (n && len) {
832 ctx->Xi.c[n] ^= *(aad++);
833 --len;
834 n = (n+1)%16;
835 }
836 if (n==0) GCM_MUL(ctx,Xi);
837 else {
838 ctx->ares = n;
839 return 0;
840 }
841 }
842
843 #ifdef GHASH
844 if ((i = (len&(size_t)-16))) {
845 GHASH(ctx,aad,i);
846 aad += i;
847 len -= i;
848 }
849 #else
850 while (len>=16) {
851 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
852 GCM_MUL(ctx,Xi);
853 aad += 16;
854 len -= 16;
855 }
856 #endif
857 if (len) {
858 n = (unsigned int)len;
859 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
860 }
861
862 ctx->ares = n;
863 return 0;
864 }
865
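/*
 * Encrypt and authenticate: CTR-mode encryption interleaved with GHASH
 * over the produced ciphertext.  The first call finalizes GHASH over
 * any buffered AAD; mres tracks the partial-block keystream residue so
 * the function can be called repeatedly on a stream.  Returns -1 if the
 * total message length exceeds the GCM limit of 2^36 - 32 bytes.
 */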
866 static int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
867 const unsigned char *in, unsigned char *out,
868 size_t len)
869 {
870 const union { long one; char little; } is_endian = {1};
871 unsigned int n, ctr;
872 size_t i;
873 u64 mlen = ctx->len.u[1];
874 block128_f block = ctx->block;
875 void *key = ctx->key;
876 #ifdef GCM_FUNCREF_4BIT
877 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
878 # ifdef GHASH
879 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
880 const u8 *inp,size_t len) = ctx->ghash;
881 # endif
882 #endif
883
884 #if 0
885 n = (unsigned int)mlen%16; /* alternative to ctx->mres */
886 #endif
887 mlen += len;
888 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
889 return -1;
890 ctx->len.u[1] = mlen;
891
892 if (ctx->ares) {
893 /* First call to encrypt finalizes GHASH(AAD) */
894 GCM_MUL(ctx,Xi);
895 ctx->ares = 0;
896 }
897
898 if (is_endian.little)
899 ctr = GETU32(ctx->Yi.c+12);
900 else
901 ctr = ctx->Yi.d[3];
902
903 n = ctx->mres;
904 #if !defined(OPENSSL_SMALL_FOOTPRINT)
905 if (16%sizeof(size_t) == 0) do { /* always true actually */
906 if (n) {
907 while (n && len) {
908 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
909 --len;
910 n = (n+1)%16;
911 }
912 if (n==0) GCM_MUL(ctx,Xi);
913 else {
914 ctx->mres = n;
915 return 0;
916 }
917 }
918 #if defined(STRICT_ALIGNMENT)
919 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
920 break;
921 #endif
922 #if defined(GHASH) && defined(GHASH_CHUNK)
923 while (len>=GHASH_CHUNK) {
924 size_t j=GHASH_CHUNK;
925
926 while (j) {
927 (*block)(ctx->Yi.c,ctx->EKi.c,key);
928 ++ctr;
929 if (is_endian.little){
930 PUTU32(ctx->Yi.c+12,ctr);
931 }
932 else
933 ctx->Yi.d[3] = ctr;
934 for (i=0; i<16; i+=sizeof(size_t))
935 *(size_t *)(out+i) =
936 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
937 out += 16;
938 in += 16;
939 j -= 16;
940 }
941 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
942 len -= GHASH_CHUNK;
943 }
944 if ((i = (len&(size_t)-16))) {
945 size_t j=i;
946
947 while (len>=16) {
948 (*block)(ctx->Yi.c,ctx->EKi.c,key);
949 ++ctr;
950 if (is_endian.little){
951 PUTU32(ctx->Yi.c+12,ctr);
952 }
953 else
954 ctx->Yi.d[3] = ctr;
955 for (i=0; i<16; i+=sizeof(size_t))
956 *(size_t *)(out+i) =
957 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
958 out += 16;
959 in += 16;
960 len -= 16;
961 }
962 GHASH(ctx,out-j,j);
963 }
964 #else
965 while (len>=16) {
966 (*block)(ctx->Yi.c,ctx->EKi.c,key);
967 ++ctr;
968 if (is_endian.little){
969 PUTU32(ctx->Yi.c+12,ctr);
970 }
971 else
972 ctx->Yi.d[3] = ctr;
973 for (i=0; i<16; i+=sizeof(size_t))
974 *(size_t *)(ctx->Xi.c+i) ^=
975 *(size_t *)(out+i) =
976 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
977 GCM_MUL(ctx,Xi);
978 out += 16;
979 in += 16;
980 len -= 16;
981 }
982 #endif
983 if (len) {
984 (*block)(ctx->Yi.c,ctx->EKi.c,key);
985 ++ctr;
986 if (is_endian.little){
987 PUTU32(ctx->Yi.c+12,ctr);
988 }
989 else
990 ctx->Yi.d[3] = ctr;
991 while (len--) {
992 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
993 ++n;
994 }
995 }
996
997 ctx->mres = n;
998 return 0;
999 } while(0);
1000 #endif
1001 for (i=0;i<len;++i) {
1002 if (n==0) {
1003 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1004 ++ctr;
1005 if (is_endian.little){
1006 PUTU32(ctx->Yi.c+12,ctr);
1007 }
1008 else
1009 ctx->Yi.d[3] = ctr;
1010 }
1011 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1012 n = (n+1)%16;
1013 if (n==0)
1014 GCM_MUL(ctx,Xi);
1015 }
1016
1017 ctx->mres = n;
1018 return 0;
1019 }
1020
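/*
 * Decrypt and authenticate: the mirror image of CRYPTO_gcm128_encrypt.
 * Incoming ciphertext is folded into GHASH and XORed with the keystream
 * to recover the plaintext; the length limit and streaming behaviour
 * are the same as on the encrypt path.
 */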
1021 static int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1022 const unsigned char *in, unsigned char *out,
1023 size_t len)
1024 {
1025 const union { long one; char little; } is_endian = {1};
1026 unsigned int n, ctr;
1027 size_t i;
1028 u64 mlen = ctx->len.u[1];
1029 block128_f block = ctx->block;
1030 void *key = ctx->key;
1031 #ifdef GCM_FUNCREF_4BIT
1032 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1033 # ifdef GHASH
1034 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1035 const u8 *inp,size_t len) = ctx->ghash;
1036 # endif
1037 #endif
1038
1039 mlen += len;
1040 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1041 return -1;
1042 ctx->len.u[1] = mlen;
1043
1044 if (ctx->ares) {
1045 /* First call to decrypt finalizes GHASH(AAD) */
1046 GCM_MUL(ctx,Xi);
1047 ctx->ares = 0;
1048 }
1049
1050 if (is_endian.little)
1051 ctr = GETU32(ctx->Yi.c+12);
1052 else
1053 ctr = ctx->Yi.d[3];
1054
1055 n = ctx->mres;
1056 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1057 if (16%sizeof(size_t) == 0) do { /* always true actually */
1058 if (n) {
1059 while (n && len) {
1060 u8 c = *(in++);
1061 *(out++) = c^ctx->EKi.c[n];
1062 ctx->Xi.c[n] ^= c;
1063 --len;
1064 n = (n+1)%16;
1065 }
1066 if (n==0) GCM_MUL (ctx,Xi);
1067 else {
1068 ctx->mres = n;
1069 return 0;
1070 }
1071 }
1072 #if defined(STRICT_ALIGNMENT)
1073 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1074 break;
1075 #endif
1076 #if defined(GHASH) && defined(GHASH_CHUNK)
1077 while (len>=GHASH_CHUNK) {
1078 size_t j=GHASH_CHUNK;
1079
1080 GHASH(ctx,in,GHASH_CHUNK);
1081 while (j) {
1082 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1083 ++ctr;
1084 if (is_endian.little){
1085 PUTU32(ctx->Yi.c+12,ctr);
1086 }
1087 else
1088 ctx->Yi.d[3] = ctr;
1089 for (i=0; i<16; i+=sizeof(size_t))
1090 *(size_t *)(out+i) =
1091 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1092 out += 16;
1093 in += 16;
1094 j -= 16;
1095 }
1096 len -= GHASH_CHUNK;
1097 }
1098 if ((i = (len&(size_t)-16))) {
1099 GHASH(ctx,in,i);
1100 while (len>=16) {
1101 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1102 ++ctr;
1103 if (is_endian.little){
1104 PUTU32(ctx->Yi.c+12,ctr);
1105 }
1106 else
1107 ctx->Yi.d[3] = ctr;
1108 for (i=0; i<16; i+=sizeof(size_t))
1109 *(size_t *)(out+i) =
1110 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1111 out += 16;
1112 in += 16;
1113 len -= 16;
1114 }
1115 }
1116 #else
1117 while (len>=16) {
1118 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1119 ++ctr;
1120 if (is_endian.little){
1121 PUTU32(ctx->Yi.c+12,ctr);
1122 }
1123 else
1124 ctx->Yi.d[3] = ctr;
1125 for (i=0; i<16; i+=sizeof(size_t)) {
1126 size_t c = *(size_t *)(in+i);
1127 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1128 *(size_t *)(ctx->Xi.c+i) ^= c;
1129 }
1130 GCM_MUL(ctx,Xi);
1131 out += 16;
1132 in += 16;
1133 len -= 16;
1134 }
1135 #endif
1136 if (len) {
1137 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1138 ++ctr;
1139 if (is_endian.little){
1140 PUTU32(ctx->Yi.c+12,ctr);
1141 }
1142 else
1143 ctx->Yi.d[3] = ctr;
1144 while (len--) {
1145 u8 c = in[n];
1146 ctx->Xi.c[n] ^= c;
1147 out[n] = c^ctx->EKi.c[n];
1148 ++n;
1149 }
1150 }
1151
1152 ctx->mres = n;
1153 return 0;
1154 } while(0);
1155 #endif
1156 for (i=0;i<len;++i) {
1157 u8 c;
1158 if (n==0) {
1159 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1160 ++ctr;
1161 if (is_endian.little){
1162 PUTU32(ctx->Yi.c+12,ctr);
1163 }
1164 else
1165 ctx->Yi.d[3] = ctr;
1166 }
1167 c = in[i];
1168 out[i] = c^ctx->EKi.c[n];
1169 ctx->Xi.c[n] ^= c;
1170 n = (n+1)%16;
1171 if (n==0)
1172 GCM_MUL(ctx,Xi);
1173 }
1174
1175 ctx->mres = n;
1176 return 0;
1177 }
1178
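/*
 * Same as CRYPTO_gcm128_encrypt, but bulk keystream generation is
 * delegated to a ctr128_f routine (stream), which encrypts whole blocks
 * with a 32-bit big-endian counter; any trailing partial block is
 * handled with the generic block function.
 */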
1179 static int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1180 const unsigned char *in, unsigned char *out,
1181 size_t len, ctr128_f stream)
1182 {
1183 const union { long one; char little; } is_endian = {1};
1184 unsigned int n, ctr;
1185 size_t i;
1186 u64 mlen = ctx->len.u[1];
1187 void *key = ctx->key;
1188 #ifdef GCM_FUNCREF_4BIT
1189 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1190 # ifdef GHASH
1191 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1192 const u8 *inp,size_t len) = ctx->ghash;
1193 # endif
1194 #endif
1195
1196 mlen += len;
1197 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1198 return -1;
1199 ctx->len.u[1] = mlen;
1200
1201 if (ctx->ares) {
1202 /* First call to encrypt finalizes GHASH(AAD) */
1203 GCM_MUL(ctx,Xi);
1204 ctx->ares = 0;
1205 }
1206
1207 if (is_endian.little)
1208 ctr = GETU32(ctx->Yi.c+12);
1209 else
1210 ctr = ctx->Yi.d[3];
1211
1212 n = ctx->mres;
1213 if (n) {
1214 while (n && len) {
1215 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1216 --len;
1217 n = (n+1)%16;
1218 }
1219 if (n==0) GCM_MUL(ctx,Xi);
1220 else {
1221 ctx->mres = n;
1222 return 0;
1223 }
1224 }
1225 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1226 while (len>=GHASH_CHUNK) {
1227 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1228 ctr += GHASH_CHUNK/16;
1229 if (is_endian.little){
1230 PUTU32(ctx->Yi.c+12,ctr);
1231 }
1232 else
1233 ctx->Yi.d[3] = ctr;
1234 GHASH(ctx,out,GHASH_CHUNK);
1235 out += GHASH_CHUNK;
1236 in += GHASH_CHUNK;
1237 len -= GHASH_CHUNK;
1238 }
1239 #endif
1240 i = (len&(size_t)-16);
1241 if (i) {
1242 size_t j=i/16;
1243
1244 (*stream)(in,out,j,key,ctx->Yi.c);
1245 ctr += (unsigned int)j;
1246 if (is_endian.little){
1247 PUTU32(ctx->Yi.c+12,ctr);
1248 }
1249 else
1250 ctx->Yi.d[3] = ctr;
1251 in += i;
1252 len -= i;
1253 #if defined(GHASH)
1254 GHASH(ctx,out,i);
1255 out += i;
1256 #else
1257 while (j--) {
1258 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1259 GCM_MUL(ctx,Xi);
1260 out += 16;
1261 }
1262 #endif
1263 }
1264 if (len) {
1265 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1266 ++ctr;
1267 if (is_endian.little){
1268 PUTU32(ctx->Yi.c+12,ctr);
1269 }
1270 else
1271 ctx->Yi.d[3] = ctr;
1272 while (len--) {
1273 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1274 ++n;
1275 }
1276 }
1277
1278 ctx->mres = n;
1279 return 0;
1280 }
1281
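/*
 * Decryption counterpart of CRYPTO_gcm128_encrypt_ctr32: GHASH is run
 * over the incoming ciphertext and the ctr128_f routine produces the
 * keystream for the whole-block portion.
 */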
1282 static int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1283 const unsigned char *in, unsigned char *out,
1284 size_t len,ctr128_f stream)
1285 {
1286 const union { long one; char little; } is_endian = {1};
1287 unsigned int n, ctr;
1288 size_t i;
1289 u64 mlen = ctx->len.u[1];
1290 void *key = ctx->key;
1291 #ifdef GCM_FUNCREF_4BIT
1292 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1293 # ifdef GHASH
1294 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1295 const u8 *inp,size_t len) = ctx->ghash;
1296 # endif
1297 #endif
1298
1299 mlen += len;
1300 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1301 return -1;
1302 ctx->len.u[1] = mlen;
1303
1304 if (ctx->ares) {
1305 /* First call to decrypt finalizes GHASH(AAD) */
1306 GCM_MUL(ctx,Xi);
1307 ctx->ares = 0;
1308 }
1309
1310 if (is_endian.little)
1311 ctr = GETU32(ctx->Yi.c+12);
1312 else
1313 ctr = ctx->Yi.d[3];
1314
1315 n = ctx->mres;
1316 if (n) {
1317 while (n && len) {
1318 u8 c = *(in++);
1319 *(out++) = c^ctx->EKi.c[n];
1320 ctx->Xi.c[n] ^= c;
1321 --len;
1322 n = (n+1)%16;
1323 }
1324 if (n==0) GCM_MUL (ctx,Xi);
1325 else {
1326 ctx->mres = n;
1327 return 0;
1328 }
1329 }
1330 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1331 while (len>=GHASH_CHUNK) {
1332 GHASH(ctx,in,GHASH_CHUNK);
1333 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1334 ctr += GHASH_CHUNK/16;
1335 if (is_endian.little){
1336 PUTU32(ctx->Yi.c+12,ctr);
1337 }
1338 else
1339 ctx->Yi.d[3] = ctr;
1340 out += GHASH_CHUNK;
1341 in += GHASH_CHUNK;
1342 len -= GHASH_CHUNK;
1343 }
1344 #endif
1345 i = (len&(size_t)-16);
1346 if (i) {
1347 size_t j=i/16;
1348
1349 #if defined(GHASH)
1350 GHASH(ctx,in,i);
1351 #else
1352 while (j--) {
1353 size_t k;
1354 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1355 GCM_MUL(ctx,Xi);
1356 in += 16;
1357 }
1358 j = i/16;
1359 in -= i;
1360 #endif
1361 (*stream)(in,out,j,key,ctx->Yi.c);
1362 ctr += (unsigned int)j;
1363 if (is_endian.little){
1364 PUTU32(ctx->Yi.c+12,ctr);
1365 }
1366 else
1367 ctx->Yi.d[3] = ctr;
1368 out += i;
1369 in += i;
1370 len -= i;
1371 }
1372 if (len) {
1373 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1374 ++ctr;
1375 if (is_endian.little){
1376 PUTU32(ctx->Yi.c+12,ctr);
1377 }
1378 else
1379 ctx->Yi.d[3] = ctr;
1380 while (len--) {
1381 u8 c = in[n];
1382 ctx->Xi.c[n] ^= c;
1383 out[n] = c^ctx->EKi.c[n];
1384 ++n;
1385 }
1386 }
1387
1388 ctx->mres = n;
1389 return 0;
1390 }
1391
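/*
 * Finalize: flush any buffered partial block into GHASH, fold in the
 * AAD and ciphertext bit lengths, then XOR EK0 to obtain the tag in
 * Xi.  If a tag is supplied it is compared with memcmp and 0 is
 * returned on a match; note that memcmp is not a constant-time
 * comparison.
 */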
1392 static int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1393 size_t len)
1394 {
1395 const union { long one; char little; } is_endian = {1};
1396 u64 alen = ctx->len.u[0]<<3;
1397 u64 clen = ctx->len.u[1]<<3;
1398 #ifdef GCM_FUNCREF_4BIT
1399 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1400 #endif
1401
1402 if (ctx->mres || ctx->ares)
1403 GCM_MUL(ctx,Xi);
1404
1405 if (is_endian.little) {
1406 #ifdef BSWAP8
1407 alen = BSWAP8(alen);
1408 clen = BSWAP8(clen);
1409 #else
1410 u8 *p = ctx->len.c;
1411
1412 ctx->len.u[0] = alen;
1413 ctx->len.u[1] = clen;
1414
1415 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1416 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1417 #endif
1418 }
1419
1420 ctx->Xi.u[0] ^= alen;
1421 ctx->Xi.u[1] ^= clen;
1422 GCM_MUL(ctx,Xi);
1423
1424 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1425 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1426
1427 if (tag && len<=sizeof(ctx->Xi))
1428 return memcmp(ctx->Xi.c,tag,len);
1429 else
1430 return -1;
1431 }
1432
1433 static void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1434 {
1435 CRYPTO_gcm128_finish(ctx, NULL, 0);
1436 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1437 }
1438
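/*
 * One-shot SM4-GCM helper: builds the SM4 key schedule, runs the
 * init/setiv/aad sequence above, then encrypts (enc != 0) or decrypts
 * (enc == 0) and writes in->tag_size bytes of tag to out->tag.  Note
 * that on decryption the tag is only computed, not verified; the caller
 * is expected to compare out->tag against the received tag.
 */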
1439 int rk_sm4_gcm_encrypt(struct sm4_ae_in *in, struct sm4_ae_out *out, const int enc)
1440 {
1441 GCM128_CONTEXT ctx;
1442 sm4_context sm4_ctx;
1443
1444 if (in == NULL || out == NULL)
1445 return -1;
1446
1447 rk_sm4_setkey_enc(&sm4_ctx, in->key);
1448 CRYPTO_gcm128_init(&ctx,&sm4_ctx,rk_rk_sm4_crypt_ecb);
1449 CRYPTO_gcm128_setiv(&ctx,in->iv,in->iv_len);
1450 if (in->aad_len) CRYPTO_gcm128_aad(&ctx,in->aad,in->aad_len);
1451 if(enc){
1452 if (in->src_len) CRYPTO_gcm128_encrypt(&ctx,in->src,out->dest,in->src_len);
1453 CRYPTO_gcm128_tag(&ctx, out->tag, in->tag_size);
1454 return 0;
1455 }else{
1456 if (in->src_len) CRYPTO_gcm128_decrypt(&ctx,in->src,out->dest,in->src_len);
1457 CRYPTO_gcm128_tag(&ctx,out->tag,in->tag_size);
1458 return 0;
1459 }
1460 }
1461
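/*
 * Illustrative one-shot usage (a sketch only -- the exact layout of
 * struct sm4_ae_in/sm4_ae_out is defined in sm4_core.h; the fields
 * below are the ones this file actually dereferences):
 *
 *     struct sm4_ae_in in = {0};
 *     struct sm4_ae_out out = {0};
 *     in.key = key;        in.iv = iv;          in.iv_len = 12;
 *     in.aad = aad;        in.aad_len = aad_len;
 *     in.src = plaintext;  in.src_len = plaintext_len;
 *     in.tag_size = 16;
 *     out.dest = ciphertext;
 *     out.tag = tag_buf;
 *     rk_sm4_gcm_encrypt(&in, &out, 1);   /+ enc=1: encrypt, enc=0: decrypt +/
 *
 * For decryption (enc == 0) the caller must compare out.tag against the
 * received tag itself, preferably in constant time.
 */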
1462
1463