/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
-*- linux-c -*-
   drbd_vli.h
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

 */

#ifndef _DRBD_VLI_H
#define _DRBD_VLI_H

/*
 * At a granularity of 4KiB of storage represented per bit,
 * and storage sizes of several TiB,
 * and possibly small-bandwidth replication,
 * the bitmap transfer time can take much too long,
 * if transmitted in plain text.
 *
 * We try to reduce the transferred bitmap information
 * by encoding runlengths of bit polarity.
 *
 * We never actually need to encode a "zero" (runlengths are positive).
 * But then we have to store the value of the first bit.
 * The first bit of information thus shall encode whether the first runlength
 * gives the number of set or unset bits.
 *
 * We assume that large areas are either completely set or unset,
 * which gives good compression with any runlength method,
 * even when encoding the runlength as fixed size 32bit/64bit integers.
 *
 * Still, there may be areas where the polarity flips every few bits,
 * and encoding the runlength sequence of those areas with fixed size
 * integers would be much worse than plaintext.
 *
 * We want to encode small runlength values with minimum code length,
 * while still being able to encode a huge run of all zeros.
 *
 * Thus we need a Variable Length Integer encoding, VLI.
 *
 * For some cases, we produce more code bits than plaintext input.
 * We need to send incompressible chunks as plaintext, skip over them
 * and then see if the next chunk compresses better.
 *
 * We don't care too much about "excellent" compression ratio for large
 * runlengths (all set/all clear): whether we achieve a factor of 100
 * or 1000 is not that much of an issue.
 * We do not want to waste too much on short runlengths in the "noisy"
 * parts of the bitmap, though.
 *
 * There are endless variants of VLI; we experimented with:
 *  * simple byte-based
 *  * various bit-based ones with different code word lengths.
 *
 * To avoid yet another configuration parameter (choice of bitmap compression
 * algorithm) which was difficult to explain and tune, we just chose the one
 * variant that turned out best in all test cases.
 * Based on real world usage patterns, with device sizes ranging from a few GiB
 * to several TiB, file server/mailserver/webserver/mysql/postgres,
 * mostly idle to really busy, the all time winner (though sometimes only
 * marginally better) is:
 */

/*
 * encoding is "visualised" as
 * __little endian__ bitstream, least significant bit first (left most)
 *
 * this particular encoding is chosen so that the prefix code
 * starts as a unary encoding of the level, then is modified so that
 * 10 levels can be described in 8bit, with minimal overhead
 * for the smaller levels.
 *
 * The number of data bits follows the Fibonacci sequence, with the exception
 * of the last level (+1 data bit, so it makes 64bit total).  The only worse
 * code when encoding bit polarity runlength is 1 plain bit => 2 code bits.
prefix    data bits                                    max val  Nº data bits
0 x                                                         0x2            1
10 x                                                        0x4            1
110 xx                                                      0x8            2
1110 xxx                                                   0x10            3
11110 xxx xx                                               0x30            5
111110 xx xxxxxx                                          0x130            8
11111100  xxxxxxxx xxxxx                                 0x2130           13
11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
 * maximum encodable value: 0x100000400202130 == 2**56 + some */

/* compression "table":
 transmitted   x                                0.29
 as plaintext x                                  ........................
             x                                   ........................
            x                                    ........................
           x    0.59                         0.21........................
          x      ........................................................
         x       .. c ...................................................
        x    0.44.. o ...................................................
       x .......... d ...................................................
      x  .......... e ...................................................
     X.............   ...................................................
    x.............. b ...................................................
2.0x............... i ...................................................
 #X................ t ...................................................
 #................. s ...........................  plain bits  ..........
-+-----------------------------------------------------------------------
 1             16              32                              64
*/

/* LEVEL: (total bits, prefix bits, prefix value),
 * sorted ascending by number of total bits.
 * The rest of the code table is calculated at compiletime from this. */

/* fibonacci data 1, 1, ... */
#define VLI_L_1_1() do { \
	LEVEL( 2, 1, 0x00); \
	LEVEL( 3, 2, 0x01); \
	LEVEL( 5, 3, 0x03); \
	LEVEL( 7, 4, 0x07); \
	LEVEL(10, 5, 0x0f); \
	LEVEL(14, 6, 0x1f); \
	LEVEL(21, 8, 0x3f); \
	LEVEL(29, 8, 0x7f); \
	LEVEL(42, 8, 0xbf); \
	LEVEL(64, 8, 0xff); \
	} while (0)
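
/*
 * Illustrative note (not part of the original DRBD sources): each
 * LEVEL(t, b, v) above is one prefix code with t - b data bits.  With
 * "adj" starting at 1 and growing by 2^(t - b) per level, the first
 * levels cover the following runlength ranges:
 *
 *	LEVEL( 2, 1, 0x00): 1 data bit,  values  1 ..  2
 *	LEVEL( 3, 2, 0x01): 1 data bit,  values  3 ..  4
 *	LEVEL( 5, 3, 0x03): 2 data bits, values  5 ..  8
 *	LEVEL( 7, 4, 0x07): 3 data bits, values  9 .. 16
 *
 * and so on, up to LEVEL(64, 8, 0xff) for runlengths beyond 0x400202130.
 */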

/* finds a suitable level to decode the least significant part of in.
 * returns number of bits consumed.
 *
 * BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{
	u64 adj = 1;

#define LEVEL(t,b,v)					\
	do {						\
		if ((in & ((1 << b) -1)) == v) {	\
			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
			return t;			\
		}					\
		adj += 1ULL << (t - b);			\
	} while (0)

	VLI_L_1_1();

	/* NOT REACHED, if the VLI_L_1_1 code table is defined properly */
	BUG();
#undef LEVEL
}
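
/*
 * Worked example (illustrative, not from the original header): the 5-bit
 * code 0b00011 hits LEVEL(5, 3, 0x03): its low 3 bits match the prefix
 * 0x03, the remaining two data bits are 0, and adj has grown to
 * 1 + 2 + 2 = 5 by then, so vli_decode_bits() stores 5 in *out and
 * returns 5 consumed bits.
 */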

/* return number of code bits needed,
 * or negative error number */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{
	u64 max = 0;
	u64 adj = 1;

	if (in == 0)
		return -EINVAL;

#define LEVEL(t,b,v) do {		\
		max += 1ULL << (t - b);	\
		if (in <= max) {	\
			if (out)	\
				*out = ((in - adj) << b) | v;	\
			return t;	\
		}			\
		adj = max + 1;		\
	} while (0)

	VLI_L_1_1();

	return -EOVERFLOW;
#undef LEVEL
}
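
/*
 * Worked example (illustrative, not from the original header):
 *
 *	u64 code;
 *	int bits = __vli_encode_bits(&code, 5);
 *
 * picks LEVEL(5, 3, 0x03): max has reached 2 + 2 + 4 = 8 >= 5 and adj
 * is 5, so code = ((5 - 5) << 3) | 0x03 = 0x03 and bits == 5; this is
 * the same 5-bit pattern that vli_decode_bits() above turns back into 5.
 */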

#undef VLI_L_1_1

/* code from here down is independent of the actually used bit code */

/*
 * Code length is determined by some unique (e.g. unary) prefix.
 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
 * not a byte stream.
 */

/* for the bitstream, we need a cursor */
struct bitstream_cursor {
	/* the current byte */
	u8 *b;
	/* the current bit within *b, normalized: 0..7 */
	unsigned int bit;
};

/* initialize cursor to point to first bit of stream */
static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
{
	cur->b = s;
	cur->bit = 0;
}

/* advance cursor by that many bits; maximum expected input value: 64,
 * but depending on VLI implementation, it may be more. */
static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
{
	bits += cur->bit;
	cur->b = cur->b + (bits >> 3);
	cur->bit = bits & 7;
}
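
/*
 * Example (illustrative, not from the original header): a cursor at
 * byte b, bit 6, advanced by 5 bits ends up at 6 + 5 = 11, i.e. at
 * byte b + (11 >> 3) = b + 1 and bit 11 & 7 = 3.
 */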

/* the bitstream itself knows its length */
struct bitstream {
	struct bitstream_cursor cur;
	unsigned char *buf;
	size_t buf_len;		/* in bytes */

	/* for input stream:
	 * number of trailing 0 bits for padding
	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
	unsigned int pad_bits;
};
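
/*
 * Example (illustrative, not from the original header): a received
 * stream carrying 13 valid bits in a 2-byte buffer is described by
 * buf_len = 2 and pad_bits = 3, since 2 * 8 - 3 = 13.
 */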
225*4882a593Smuzhiyun 
bitstream_init(struct bitstream * bs,void * s,size_t len,unsigned int pad_bits)226*4882a593Smuzhiyun static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
227*4882a593Smuzhiyun {
228*4882a593Smuzhiyun 	bs->buf = s;
229*4882a593Smuzhiyun 	bs->buf_len = len;
230*4882a593Smuzhiyun 	bs->pad_bits = pad_bits;
231*4882a593Smuzhiyun 	bitstream_cursor_reset(&bs->cur, bs->buf);
232*4882a593Smuzhiyun }
233*4882a593Smuzhiyun 
bitstream_rewind(struct bitstream * bs)234*4882a593Smuzhiyun static inline void bitstream_rewind(struct bitstream *bs)
235*4882a593Smuzhiyun {
236*4882a593Smuzhiyun 	bitstream_cursor_reset(&bs->cur, bs->buf);
237*4882a593Smuzhiyun 	memset(bs->buf, 0, bs->buf_len);
238*4882a593Smuzhiyun }

/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 * Ignores "pad_bits".
 * Returns zero if bits == 0 (nothing to do).
 * Returns number of bits used if successful.
 *
 * If there is not enough room left in bitstream,
 * leaves bitstream unchanged and returns -ENOBUFS.
 */
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
	unsigned char *b = bs->cur.b;
	unsigned int tmp;

	if (bits == 0)
		return 0;

	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
		return -ENOBUFS;

	/* paranoia: strip off hi bits; they should not be set anyways. */
	if (bits < 64)
		val &= ~0ULL >> (64 - bits);

	*b++ |= (val & 0xff) << bs->cur.bit;

	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
		*b++ |= (val >> tmp) & 0xff;

	bitstream_cursor_advance(&bs->cur, bits);
	return bits;
}
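
/*
 * Example (illustrative, not from the original header): with the cursor
 * at byte 0, bit 6, bitstream_put_bits(bs, 0x5, 3) ORs the two low value
 * bits into bits 6..7 of byte 0, the remaining value bit into bit 0 of
 * byte 1, and leaves the cursor at byte 1, bit 1.  The OR-only stores
 * rely on the destination bytes being zero beforehand, which is what
 * bitstream_rewind() guarantees.
 */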

/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 *
 * If more than 64 bits are requested, returns -EINVAL and leaves *out unchanged.
 *
 * If there are fewer than the requested number of valid bits left in the
 * bitstream, still fetches all available bits.
 *
 * Returns number of actually fetched bits.
 */
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
	u64 val;
	unsigned int n;

	if (bits > 64)
		return -EINVAL;

	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
			- bs->cur.bit - bs->pad_bits;

	if (bits == 0) {
		*out = 0;
		return 0;
	}

	/* get the high bits */
	val = 0;
	n = (bs->cur.bit + bits + 7) >> 3;
	/* n may be at most 9, if cur.bit + bits > 64 */
	/* which means this copies at most 8 bytes */
	if (n) {
		memcpy(&val, bs->cur.b+1, n - 1);
		val = le64_to_cpu(val) << (8 - bs->cur.bit);
	}

	/* we still need the low bits */
	val |= bs->cur.b[0] >> bs->cur.bit;

	/* and mask out bits we don't want */
	val &= ~0ULL >> (64 - bits);

	bitstream_cursor_advance(&bs->cur, bits);
	*out = val;

	return bits;
}
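
/*
 * Example (illustrative, not from the original header): reading back the
 * 3 bits written in the bitstream_put_bits() example above, with the
 * cursor again at byte 0, bit 6, yields val = 0x5: the high bit comes
 * from byte 1 shifted left by (8 - 6) = 2, the two low bits from byte 0
 * shifted right by 6, and the mask ~0ULL >> (64 - 3) keeps only 3 bits.
 */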

/* encodes @in as vli into @bs;
 *
 * return values
 *  > 0: number of bits successfully stored in bitstream
 * -ENOBUFS @bs is full
 * -EINVAL input zero (invalid)
 * -EOVERFLOW input too large for this vli code (invalid)
 */
static inline int vli_encode_bits(struct bitstream *bs, u64 in)
{
	/* the self-initialization silences a spurious "may be used
	 * uninitialized" warning; code is only read below if
	 * __vli_encode_bits() succeeded and actually wrote it. */
	u64 code = code;
	int bits = __vli_encode_bits(&code, in);

	if (bits <= 0)
		return bits;

	return bitstream_put_bits(bs, code, bits);
}
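
/*
 * Usage sketch (illustrative only, not taken from the DRBD sources;
 * "buf" and "rl" are made-up names): encoding a few runlengths into a
 * buffer and stopping once the buffer is full:
 *
 *	static const u64 rl[] = { 1, 5, 17, 1000000 };
 *	unsigned char buf[256];
 *	struct bitstream bs;
 *	unsigned int i;
 *	int bits;
 *
 *	bitstream_init(&bs, buf, sizeof(buf), 0);
 *	bitstream_rewind(&bs);		(zeroes buf; put_bits only ORs bits in)
 *	for (i = 0; i < ARRAY_SIZE(rl); i++) {
 *		bits = vli_encode_bits(&bs, rl[i]);
 *		if (bits == -ENOBUFS)
 *			break;		(buffer full: transmit what fits)
 *		if (bits < 0)
 *			return bits;	(rl[i] was zero or too large)
 *	}
 */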

#endif /* _DRBD_VLI_H */