xref: /OK3568_Linux_fs/kernel/include/crypto/gf128mul.h (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /* gf128mul.h - GF(2^128) multiplication functions
2*4882a593Smuzhiyun  *
3*4882a593Smuzhiyun  * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
4*4882a593Smuzhiyun  * Copyright (c) 2006 Rik Snel <rsnel@cube.dyndns.org>
5*4882a593Smuzhiyun  *
6*4882a593Smuzhiyun  * Based on Dr Brian Gladman's (GPL'd) work published at
7*4882a593Smuzhiyun  * http://fp.gladman.plus.com/cryptography_technology/index.htm
8*4882a593Smuzhiyun  * See the original copyright notice below.
9*4882a593Smuzhiyun  *
10*4882a593Smuzhiyun  * This program is free software; you can redistribute it and/or modify it
11*4882a593Smuzhiyun  * under the terms of the GNU General Public License as published by the Free
12*4882a593Smuzhiyun  * Software Foundation; either version 2 of the License, or (at your option)
13*4882a593Smuzhiyun  * any later version.
14*4882a593Smuzhiyun  */
15*4882a593Smuzhiyun /*
16*4882a593Smuzhiyun  ---------------------------------------------------------------------------
17*4882a593Smuzhiyun  Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
18*4882a593Smuzhiyun 
19*4882a593Smuzhiyun  LICENSE TERMS
20*4882a593Smuzhiyun 
21*4882a593Smuzhiyun  The free distribution and use of this software in both source and binary
22*4882a593Smuzhiyun  form is allowed (with or without changes) provided that:
23*4882a593Smuzhiyun 
24*4882a593Smuzhiyun    1. distributions of this source code include the above copyright
25*4882a593Smuzhiyun       notice, this list of conditions and the following disclaimer;
26*4882a593Smuzhiyun 
27*4882a593Smuzhiyun    2. distributions in binary form include the above copyright
28*4882a593Smuzhiyun       notice, this list of conditions and the following disclaimer
29*4882a593Smuzhiyun       in the documentation and/or other associated materials;
30*4882a593Smuzhiyun 
31*4882a593Smuzhiyun    3. the copyright holder's name is not used to endorse products
32*4882a593Smuzhiyun       built using this software without specific written permission.
33*4882a593Smuzhiyun 
34*4882a593Smuzhiyun  ALTERNATIVELY, provided that this notice is retained in full, this product
35*4882a593Smuzhiyun  may be distributed under the terms of the GNU General Public License (GPL),
36*4882a593Smuzhiyun  in which case the provisions of the GPL apply INSTEAD OF those given above.
37*4882a593Smuzhiyun 
38*4882a593Smuzhiyun  DISCLAIMER
39*4882a593Smuzhiyun 
40*4882a593Smuzhiyun  This software is provided 'as is' with no explicit or implied warranties
41*4882a593Smuzhiyun  in respect of its properties, including, but not limited to, correctness
42*4882a593Smuzhiyun  and/or fitness for purpose.
43*4882a593Smuzhiyun  ---------------------------------------------------------------------------
44*4882a593Smuzhiyun  Issue Date: 31/01/2006
45*4882a593Smuzhiyun 
46*4882a593Smuzhiyun  An implementation of field multiplication in Galois Field GF(2^128)
47*4882a593Smuzhiyun */
48*4882a593Smuzhiyun 
49*4882a593Smuzhiyun #ifndef _CRYPTO_GF128MUL_H
50*4882a593Smuzhiyun #define _CRYPTO_GF128MUL_H
51*4882a593Smuzhiyun 
52*4882a593Smuzhiyun #include <asm/byteorder.h>
53*4882a593Smuzhiyun #include <crypto/b128ops.h>
54*4882a593Smuzhiyun #include <linux/slab.h>
55*4882a593Smuzhiyun 
56*4882a593Smuzhiyun /* Comment by Rik:
57*4882a593Smuzhiyun  *
58*4882a593Smuzhiyun  * For some background on GF(2^128) see for example:
59*4882a593Smuzhiyun  * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
60*4882a593Smuzhiyun  *
61*4882a593Smuzhiyun  * The elements of GF(2^128) := GF(2)[X]/(X^128-X^7-X^2-X^1-1) can
62*4882a593Smuzhiyun  * be mapped to computer memory in a variety of ways. Let's examine
63*4882a593Smuzhiyun  * three common cases.
64*4882a593Smuzhiyun  *
65*4882a593Smuzhiyun  * Take a look at the 16 binary octets below in memory order. The msb's
66*4882a593Smuzhiyun  * are left and the lsb's are right. char b[16] is an array and b[0] is
67*4882a593Smuzhiyun  * the first octet.
68*4882a593Smuzhiyun  *
69*4882a593Smuzhiyun  * 10000000 00000000 00000000 00000000 .... 00000000 00000000 00000000
70*4882a593Smuzhiyun  *   b[0]     b[1]     b[2]     b[3]          b[13]    b[14]    b[15]
71*4882a593Smuzhiyun  *
72*4882a593Smuzhiyun  * Every bit is a coefficient of some power of X. We can store the bits
73*4882a593Smuzhiyun  * in every byte in little-endian order and the bytes themselves also in
74*4882a593Smuzhiyun  * little endian order. I will call this lle (little-little-endian).
75*4882a593Smuzhiyun  * The above buffer represents the polynomial 1, and X^7+X^2+X^1+1 looks
76*4882a593Smuzhiyun  * like 11100001 00000000 .... 00000000 = { 0xE1, 0x00, }.
77*4882a593Smuzhiyun  * This format was originally implemented in gf128mul and is used
78*4882a593Smuzhiyun  * in GCM (Galois/Counter mode) and in ABL (Arbitrary Block Length).
79*4882a593Smuzhiyun  *
80*4882a593Smuzhiyun  * Another convention says: store the bits in bigendian order and the
81*4882a593Smuzhiyun  * bytes also. This is bbe (big-big-endian). Now the buffer above
82*4882a593Smuzhiyun  * represents X^127. X^7+X^2+X^1+1 looks like 00000000 .... 10000111,
83*4882a593Smuzhiyun  * b[15] = 0x87 and the rest is 0. LRW uses this convention and bbe
84*4882a593Smuzhiyun  * is partly implemented.
85*4882a593Smuzhiyun  *
86*4882a593Smuzhiyun  * Both of the above formats are easy to implement on big-endian
87*4882a593Smuzhiyun  * machines.
88*4882a593Smuzhiyun  *
89*4882a593Smuzhiyun  * XTS and EME (the latter of which is patent encumbered) use the ble
90*4882a593Smuzhiyun  * format (bits are stored in big endian order and the bytes in little
91*4882a593Smuzhiyun  * endian). The above buffer represents X^7 in this case and the
92*4882a593Smuzhiyun  * primitive polynomial is b[0] = 0x87.
93*4882a593Smuzhiyun  *
94*4882a593Smuzhiyun  * The common machine word-size is smaller than 128 bits, so to make
95*4882a593Smuzhiyun  * an efficient implementation we must split into machine word sizes.
96*4882a593Smuzhiyun  * This implementation uses 64-bit words for the moment. Machine
97*4882a593Smuzhiyun  * endianness comes into play. The lle format in relation to machine
98*4882a593Smuzhiyun  * endianness is discussed below by the original author of gf128mul Dr
99*4882a593Smuzhiyun  * Brian Gladman.
100*4882a593Smuzhiyun  *
101*4882a593Smuzhiyun  * Let's look at the bbe and ble format on a little endian machine.
102*4882a593Smuzhiyun  *
103*4882a593Smuzhiyun  * bbe on a little endian machine u32 x[4]:
104*4882a593Smuzhiyun  *
105*4882a593Smuzhiyun  *  MS            x[0]           LS  MS            x[1]		  LS
106*4882a593Smuzhiyun  *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
107*4882a593Smuzhiyun  *  103..96 111.104 119.112 127.120  71...64 79...72 87...80 95...88
108*4882a593Smuzhiyun  *
109*4882a593Smuzhiyun  *  MS            x[2]           LS  MS            x[3]		  LS
110*4882a593Smuzhiyun  *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
111*4882a593Smuzhiyun  *  39...32 47...40 55...48 63...56  07...00 15...08 23...16 31...24
112*4882a593Smuzhiyun  *
 * ble on a little endian machine
 *
 *  MS            x[0]           LS  MS            x[1]		  LS
 *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
 *  31...24 23...16 15...08 07...00  63...56 55...48 47...40 39...32
 *
 *  MS            x[2]           LS  MS            x[3]		  LS
 *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
 *  95...88 87...80 79...72 71...64  127.120 119.112 111.104 103..96
122*4882a593Smuzhiyun  *
123*4882a593Smuzhiyun  * Multiplications in GF(2^128) are mostly bit-shifts, so you see why
124*4882a593Smuzhiyun  * ble (and lbe also) are easier to implement on a little-endian
125*4882a593Smuzhiyun  * machine than on a big-endian machine. The converse holds for bbe
126*4882a593Smuzhiyun  * and lle.
127*4882a593Smuzhiyun  *
 * Note: to have good alignment, it seems to me that it is sufficient
 * to keep elements of GF(2^128) in type u64[2]. On 32-bit wordsize
 * machines this will automatically be aligned to wordsize and on a
 * 64-bit machine also.
132*4882a593Smuzhiyun  */
133*4882a593Smuzhiyun /*	Multiply a GF(2^128) field element by x. Field elements are
134*4882a593Smuzhiyun     held in arrays of bytes in which field bits 8n..8n + 7 are held in
135*4882a593Smuzhiyun     byte[n], with lower indexed bits placed in the more numerically
136*4882a593Smuzhiyun     significant bit positions within bytes.
137*4882a593Smuzhiyun 
138*4882a593Smuzhiyun     On little endian machines the bit indexes translate into the bit
139*4882a593Smuzhiyun     positions within four 32-bit words in the following way
140*4882a593Smuzhiyun 
141*4882a593Smuzhiyun     MS            x[0]           LS  MS            x[1]		  LS
142*4882a593Smuzhiyun     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
143*4882a593Smuzhiyun     24...31 16...23 08...15 00...07  56...63 48...55 40...47 32...39
144*4882a593Smuzhiyun 
145*4882a593Smuzhiyun     MS            x[2]           LS  MS            x[3]		  LS
146*4882a593Smuzhiyun     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
147*4882a593Smuzhiyun     88...95 80...87 72...79 64...71  120.127 112.119 104.111 96..103
148*4882a593Smuzhiyun 
149*4882a593Smuzhiyun     On big endian machines the bit indexes translate into the bit
150*4882a593Smuzhiyun     positions within four 32-bit words in the following way
151*4882a593Smuzhiyun 
152*4882a593Smuzhiyun     MS            x[0]           LS  MS            x[1]		  LS
153*4882a593Smuzhiyun     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
154*4882a593Smuzhiyun     00...07 08...15 16...23 24...31  32...39 40...47 48...55 56...63
155*4882a593Smuzhiyun 
156*4882a593Smuzhiyun     MS            x[2]           LS  MS            x[3]		  LS
157*4882a593Smuzhiyun     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
158*4882a593Smuzhiyun     64...71 72...79 80...87 88...95  96..103 104.111 112.119 120.127
159*4882a593Smuzhiyun */
160*4882a593Smuzhiyun 
161*4882a593Smuzhiyun /*	A slow generic version of gf_mul, implemented for lle and bbe
162*4882a593Smuzhiyun  * 	It multiplies a and b and puts the result in a */
163*4882a593Smuzhiyun void gf128mul_lle(be128 *a, const be128 *b);
164*4882a593Smuzhiyun 
165*4882a593Smuzhiyun void gf128mul_bbe(be128 *a, const be128 *b);
166*4882a593Smuzhiyun 
167*4882a593Smuzhiyun /*
168*4882a593Smuzhiyun  * The following functions multiply a field element by x in
169*4882a593Smuzhiyun  * the polynomial field representation.  They use 64-bit word operations
170*4882a593Smuzhiyun  * to gain speed but compensate for machine endianness and hence work
171*4882a593Smuzhiyun  * correctly on both styles of machine.
172*4882a593Smuzhiyun  *
173*4882a593Smuzhiyun  * They are defined here for performance.
174*4882a593Smuzhiyun  */
175*4882a593Smuzhiyun 
/*
 * Expand bit number 'which' of 'x' into a full 64-bit mask: all ones
 * when the bit is set, zero when it is clear.
 *
 * This is a constant-time version of
 * 'x & ((u64)1 << which) ? (u64)-1 : 0'.
 *
 * 'which' must lie in 0..63; anything else gives an invalid shift count.
 */
static inline u64 gf128mul_mask_from_bit(u64 x, int which)
{
	/*
	 * Shift the selected bit up into the sign position, then smear it
	 * across the word with a signed right shift.  This relies on '>>'
	 * of a negative s64 being an arithmetic shift.
	 */
	return ((s64)(x << (63 - which)) >> 63);
}
181*4882a593Smuzhiyun 
/*
 * Multiply the field element *x by x in the lle representation and
 * store the result in *r.
 *
 * Both halves of *x are loaded before *r is written, so r and x may
 * point to the same element.
 */
static inline void gf128mul_x_lle(be128 *r, const be128 *x)
{
	u64 a = be64_to_cpu(x->a);
	u64 b = be64_to_cpu(x->b);

	/*
	 * Reduction term: 0xe1 in the top byte of the high word when the
	 * lowest bit of b is about to be shifted out, zero otherwise.
	 * Equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48
	 * (see crypto/gf128mul.c).
	 */
	u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56);

	/* 128-bit right shift by one; a's lowest bit carries into b. */
	r->b = cpu_to_be64((b >> 1) | (a << 63));
	r->a = cpu_to_be64((a >> 1) ^ _tt);
}
194*4882a593Smuzhiyun 
/*
 * Multiply the field element *x by x in the bbe representation and
 * store the result in *r.
 *
 * Both halves of *x are loaded before *r is written, so r and x may
 * point to the same element.
 */
static inline void gf128mul_x_bbe(be128 *r, const be128 *x)
{
	u64 a = be64_to_cpu(x->a);
	u64 b = be64_to_cpu(x->b);

	/*
	 * Reduction term: 0x87 when the top bit of a is about to be
	 * shifted out, zero otherwise.  Equivalent to
	 * gf128mul_table_be[a >> 63] (see crypto/gf128mul.c).
	 */
	u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;

	/* 128-bit left shift by one; b's top bit carries into a. */
	r->a = cpu_to_be64((a << 1) | (b >> 63));
	r->b = cpu_to_be64((b << 1) ^ _tt);
}
206*4882a593Smuzhiyun 
207*4882a593Smuzhiyun /* needed by XTS */
gf128mul_x_ble(le128 * r,const le128 * x)208*4882a593Smuzhiyun static inline void gf128mul_x_ble(le128 *r, const le128 *x)
209*4882a593Smuzhiyun {
210*4882a593Smuzhiyun 	u64 a = le64_to_cpu(x->a);
211*4882a593Smuzhiyun 	u64 b = le64_to_cpu(x->b);
212*4882a593Smuzhiyun 
213*4882a593Smuzhiyun 	/* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */
214*4882a593Smuzhiyun 	u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;
215*4882a593Smuzhiyun 
216*4882a593Smuzhiyun 	r->a = cpu_to_le64((a << 1) | (b >> 63));
217*4882a593Smuzhiyun 	r->b = cpu_to_le64((b << 1) ^ _tt);
218*4882a593Smuzhiyun }
219*4882a593Smuzhiyun 
220*4882a593Smuzhiyun /* 4k table optimization */
221*4882a593Smuzhiyun 
222*4882a593Smuzhiyun struct gf128mul_4k {
223*4882a593Smuzhiyun 	be128 t[256];
224*4882a593Smuzhiyun };
225*4882a593Smuzhiyun 
226*4882a593Smuzhiyun struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g);
227*4882a593Smuzhiyun struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g);
228*4882a593Smuzhiyun void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t);
229*4882a593Smuzhiyun void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t);
230*4882a593Smuzhiyun void gf128mul_x8_ble(le128 *r, const le128 *x);
/*
 * Release a 4k table.  The table is handed to kfree_sensitive(), which
 * clears the memory before freeing it.
 */
static inline void gf128mul_free_4k(struct gf128mul_4k *t)
{
	kfree_sensitive(t);
}
235*4882a593Smuzhiyun 
236*4882a593Smuzhiyun 
/* 64k table optimization, implemented for bbe */

/* Sixteen pointers to 4k tables, which together make up the 64k table. */
struct gf128mul_64k {
	struct gf128mul_4k *t[16];
};
242*4882a593Smuzhiyun 
243*4882a593Smuzhiyun /* First initialize with the constant factor with which you
244*4882a593Smuzhiyun  * want to multiply and then call gf128mul_64k_bbe with the other
245*4882a593Smuzhiyun  * factor in the first argument, and the table in the second.
246*4882a593Smuzhiyun  * Afterwards, the result is stored in *a.
247*4882a593Smuzhiyun  */
248*4882a593Smuzhiyun struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g);
249*4882a593Smuzhiyun void gf128mul_free_64k(struct gf128mul_64k *t);
250*4882a593Smuzhiyun void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t);
251*4882a593Smuzhiyun 
252*4882a593Smuzhiyun #endif /* _CRYPTO_GF128MUL_H */
253