1*4882a593Smuzhiyun /* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun -*- linux-c -*-
   drbd_vli.h
5*4882a593Smuzhiyun This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8*4882a593Smuzhiyun Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9*4882a593Smuzhiyun Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10*4882a593Smuzhiyun
11*4882a593Smuzhiyun */
12*4882a593Smuzhiyun
13*4882a593Smuzhiyun #ifndef _DRBD_VLI_H
14*4882a593Smuzhiyun #define _DRBD_VLI_H
15*4882a593Smuzhiyun
16*4882a593Smuzhiyun /*
17*4882a593Smuzhiyun * At a granularity of 4KiB storage represented per bit,
 * and storage sizes of several TiB,
19*4882a593Smuzhiyun * and possibly small-bandwidth replication,
20*4882a593Smuzhiyun * the bitmap transfer time can take much too long,
21*4882a593Smuzhiyun * if transmitted in plain text.
22*4882a593Smuzhiyun *
23*4882a593Smuzhiyun * We try to reduce the transferred bitmap information
24*4882a593Smuzhiyun * by encoding runlengths of bit polarity.
25*4882a593Smuzhiyun *
26*4882a593Smuzhiyun * We never actually need to encode a "zero" (runlengths are positive).
27*4882a593Smuzhiyun * But then we have to store the value of the first bit.
28*4882a593Smuzhiyun * The first bit of information thus shall encode if the first runlength
29*4882a593Smuzhiyun * gives the number of set or unset bits.
30*4882a593Smuzhiyun *
31*4882a593Smuzhiyun * We assume that large areas are either completely set or unset,
32*4882a593Smuzhiyun * which gives good compression with any runlength method,
33*4882a593Smuzhiyun * even when encoding the runlength as fixed size 32bit/64bit integers.
34*4882a593Smuzhiyun *
35*4882a593Smuzhiyun * Still, there may be areas where the polarity flips every few bits,
36*4882a593Smuzhiyun * and encoding the runlength sequence of those areas with fix size
37*4882a593Smuzhiyun * integers would be much worse than plaintext.
38*4882a593Smuzhiyun *
39*4882a593Smuzhiyun * We want to encode small runlength values with minimum code length,
40*4882a593Smuzhiyun * while still being able to encode a Huge run of all zeros.
41*4882a593Smuzhiyun *
42*4882a593Smuzhiyun * Thus we need a Variable Length Integer encoding, VLI.
43*4882a593Smuzhiyun *
44*4882a593Smuzhiyun * For some cases, we produce more code bits than plaintext input.
45*4882a593Smuzhiyun * We need to send incompressible chunks as plaintext, skip over them
46*4882a593Smuzhiyun * and then see if the next chunk compresses better.
47*4882a593Smuzhiyun *
48*4882a593Smuzhiyun * We don't care too much about "excellent" compression ratio for large
49*4882a593Smuzhiyun * runlengths (all set/all clear): whether we achieve a factor of 100
50*4882a593Smuzhiyun * or 1000 is not that much of an issue.
51*4882a593Smuzhiyun * We do not want to waste too much on short runlengths in the "noisy"
52*4882a593Smuzhiyun * parts of the bitmap, though.
53*4882a593Smuzhiyun *
54*4882a593Smuzhiyun * There are endless variants of VLI, we experimented with:
55*4882a593Smuzhiyun * * simple byte-based
56*4882a593Smuzhiyun * * various bit based with different code word length.
57*4882a593Smuzhiyun *
 * To avoid yet another configuration parameter (choice of bitmap compression
59*4882a593Smuzhiyun * algorithm) which was difficult to explain and tune, we just chose the one
60*4882a593Smuzhiyun * variant that turned out best in all test cases.
61*4882a593Smuzhiyun * Based on real world usage patterns, with device sizes ranging from a few GiB
62*4882a593Smuzhiyun * to several TiB, file server/mailserver/webserver/mysql/postgress,
63*4882a593Smuzhiyun * mostly idle to really busy, the all time winner (though sometimes only
64*4882a593Smuzhiyun * marginally better) is:
65*4882a593Smuzhiyun */
66*4882a593Smuzhiyun
67*4882a593Smuzhiyun /*
68*4882a593Smuzhiyun * encoding is "visualised" as
69*4882a593Smuzhiyun * __little endian__ bitstream, least significant bit first (left most)
70*4882a593Smuzhiyun *
71*4882a593Smuzhiyun * this particular encoding is chosen so that the prefix code
72*4882a593Smuzhiyun * starts as unary encoding the level, then modified so that
73*4882a593Smuzhiyun * 10 levels can be described in 8bit, with minimal overhead
74*4882a593Smuzhiyun * for the smaller levels.
75*4882a593Smuzhiyun *
76*4882a593Smuzhiyun * Number of data bits follow fibonacci sequence, with the exception of the
77*4882a593Smuzhiyun * last level (+1 data bit, so it makes 64bit total). The only worse code when
78*4882a593Smuzhiyun * encoding bit polarity runlength is 1 plain bits => 2 code bits.
79*4882a593Smuzhiyun prefix data bits max val Nº data bits
80*4882a593Smuzhiyun 0 x 0x2 1
81*4882a593Smuzhiyun 10 x 0x4 1
82*4882a593Smuzhiyun 110 xx 0x8 2
83*4882a593Smuzhiyun 1110 xxx 0x10 3
84*4882a593Smuzhiyun 11110 xxx xx 0x30 5
85*4882a593Smuzhiyun 111110 xx xxxxxx 0x130 8
86*4882a593Smuzhiyun 11111100 xxxxxxxx xxxxx 0x2130 13
87*4882a593Smuzhiyun 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
88*4882a593Smuzhiyun 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
89*4882a593Smuzhiyun 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
90*4882a593Smuzhiyun * maximum encodable value: 0x100000400202130 == 2**56 + some */
91*4882a593Smuzhiyun
92*4882a593Smuzhiyun /* compression "table":
93*4882a593Smuzhiyun transmitted x 0.29
94*4882a593Smuzhiyun as plaintext x ........................
95*4882a593Smuzhiyun x ........................
96*4882a593Smuzhiyun x ........................
97*4882a593Smuzhiyun x 0.59 0.21........................
98*4882a593Smuzhiyun x ........................................................
99*4882a593Smuzhiyun x .. c ...................................................
100*4882a593Smuzhiyun x 0.44.. o ...................................................
101*4882a593Smuzhiyun x .......... d ...................................................
102*4882a593Smuzhiyun x .......... e ...................................................
103*4882a593Smuzhiyun X............. ...................................................
104*4882a593Smuzhiyun x.............. b ...................................................
105*4882a593Smuzhiyun 2.0x............... i ...................................................
106*4882a593Smuzhiyun #X................ t ...................................................
107*4882a593Smuzhiyun #................. s ........................... plain bits ..........
108*4882a593Smuzhiyun -+-----------------------------------------------------------------------
109*4882a593Smuzhiyun 1 16 32 64
110*4882a593Smuzhiyun */
111*4882a593Smuzhiyun
112*4882a593Smuzhiyun /* LEVEL: (total bits, prefix bits, prefix value),
113*4882a593Smuzhiyun * sorted ascending by number of total bits.
114*4882a593Smuzhiyun * The rest of the code table is calculated at compiletime from this. */
115*4882a593Smuzhiyun
/* fibonacci data 1, 1, ... */
/* Each LEVEL(t, b, v) entry: t = total code bits, b = prefix bits,
 * v = prefix value (the b low bits that identify this level).
 * Data bits per level (t - b): 1 1 2 3 5 8 13 21 34 56 -- fibonacci,
 * except the last, which is bumped so the longest code is exactly 64 bits.
 * Must stay sorted ascending by total bits; both coders walk it in order. */
#define VLI_L_1_1() do { \
	LEVEL( 2, 1, 0x00); \
	LEVEL( 3, 2, 0x01); \
	LEVEL( 5, 3, 0x03); \
	LEVEL( 7, 4, 0x07); \
	LEVEL(10, 5, 0x0f); \
	LEVEL(14, 6, 0x1f); \
	LEVEL(21, 8, 0x3f); \
	LEVEL(29, 8, 0x7f); \
	LEVEL(42, 8, 0xbf); \
	LEVEL(64, 8, 0xff); \
	} while (0)
129*4882a593Smuzhiyun
/* finds a suitable level to decode the least significant part of in.
 * returns number of bits consumed.
 *
 * BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{
	/* base value of the current level; level data bits store an offset
	 * from this base, so zero never needs to be encoded */
	u64 adj = 1;

	/* t: total code bits, b: prefix bits, v: prefix value.
	 * If the b low bits of "in" match this level's prefix, the data bits
	 * are bits b..t-1: mask them out, shift down, add the level base,
	 * and report t bits consumed.  Otherwise add this level's value
	 * range (2^(t-b)) to the base and fall through to the next level. */
#define LEVEL(t,b,v)						\
	do {							\
		if ((in & ((1 << b) -1)) == v) {		\
			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
			return t;				\
		}						\
		adj += 1ULL << (t - b);				\
	} while (0)

	VLI_L_1_1();

	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
	BUG();
#undef LEVEL
}
153*4882a593Smuzhiyun
/* return number of code bits needed,
 * or negative error number */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{
	/* max: largest value representable up to and including the current
	 * level; adj: smallest value representable at the current level */
	u64 max = 0;
	u64 adj = 1;

	/* zero is never encoded: runlengths are strictly positive */
	if (in == 0)
		return -EINVAL;

	/* t: total code bits, b: prefix bits, v: prefix value.
	 * The first level whose value range covers "in" wins: place the
	 * offset (in - adj) in the data bits above the prefix.  "out" may
	 * be NULL when the caller only wants the code length. */
#define LEVEL(t,b,v) do {				\
		max += 1ULL << (t - b);			\
		if (in <= max) {			\
			if (out)			\
				*out = ((in - adj) << b) | v;	\
			return t;			\
		}					\
		adj = max + 1;				\
	} while (0)

	VLI_L_1_1();

	/* larger than the maximum encodable value (see code table above) */
	return -EOVERFLOW;
#undef LEVEL
}
179*4882a593Smuzhiyun
180*4882a593Smuzhiyun #undef VLI_L_1_1
181*4882a593Smuzhiyun
/* code from here down is independent of actually used bit code */
183*4882a593Smuzhiyun
184*4882a593Smuzhiyun /*
185*4882a593Smuzhiyun * Code length is determined by some unique (e.g. unary) prefix.
186*4882a593Smuzhiyun * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
187*4882a593Smuzhiyun * not a byte stream.
188*4882a593Smuzhiyun */
189*4882a593Smuzhiyun
/* for the bitstream, we need a cursor */
struct bitstream_cursor {
	/* the current byte */
	u8 *b;
	/* the current bit within *b, normalized: 0..7 */
	unsigned int bit;
};
197*4882a593Smuzhiyun
198*4882a593Smuzhiyun /* initialize cursor to point to first bit of stream */
bitstream_cursor_reset(struct bitstream_cursor * cur,void * s)199*4882a593Smuzhiyun static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
200*4882a593Smuzhiyun {
201*4882a593Smuzhiyun cur->b = s;
202*4882a593Smuzhiyun cur->bit = 0;
203*4882a593Smuzhiyun }
204*4882a593Smuzhiyun
205*4882a593Smuzhiyun /* advance cursor by that many bits; maximum expected input value: 64,
206*4882a593Smuzhiyun * but depending on VLI implementation, it may be more. */
bitstream_cursor_advance(struct bitstream_cursor * cur,unsigned int bits)207*4882a593Smuzhiyun static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
208*4882a593Smuzhiyun {
209*4882a593Smuzhiyun bits += cur->bit;
210*4882a593Smuzhiyun cur->b = cur->b + (bits >> 3);
211*4882a593Smuzhiyun cur->bit = bits & 7;
212*4882a593Smuzhiyun }
213*4882a593Smuzhiyun
/* the bitstream itself knows its length */
struct bitstream {
	/* current read/write position within buf */
	struct bitstream_cursor cur;
	/* backing byte buffer */
	unsigned char *buf;
	size_t buf_len;		/* in bytes */

	/* for input stream:
	 * number of trailing 0 bits for padding
	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
	unsigned int pad_bits;
};
225*4882a593Smuzhiyun
bitstream_init(struct bitstream * bs,void * s,size_t len,unsigned int pad_bits)226*4882a593Smuzhiyun static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
227*4882a593Smuzhiyun {
228*4882a593Smuzhiyun bs->buf = s;
229*4882a593Smuzhiyun bs->buf_len = len;
230*4882a593Smuzhiyun bs->pad_bits = pad_bits;
231*4882a593Smuzhiyun bitstream_cursor_reset(&bs->cur, bs->buf);
232*4882a593Smuzhiyun }
233*4882a593Smuzhiyun
bitstream_rewind(struct bitstream * bs)234*4882a593Smuzhiyun static inline void bitstream_rewind(struct bitstream *bs)
235*4882a593Smuzhiyun {
236*4882a593Smuzhiyun bitstream_cursor_reset(&bs->cur, bs->buf);
237*4882a593Smuzhiyun memset(bs->buf, 0, bs->buf_len);
238*4882a593Smuzhiyun }
239*4882a593Smuzhiyun
/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 * Ignores "pad_bits".
 * Returns zero if bits == 0 (nothing to do).
 * Returns number of bits used if successful.
 *
 * If there is not enough room left in bitstream,
 * leaves bitstream unchanged and returns -ENOBUFS.
 */
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
	unsigned char *b = bs->cur.b;
	unsigned int tmp;

	if (bits == 0)
		return 0;

	/* reject if the last byte we would touch lies past the buffer end */
	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
		return -ENOBUFS;

	/* paranoia: strip off hi bits; they should not be set anyways. */
	if (bits < 64)
		val &= ~0ULL >> (64 - bits);

	/* first (possibly partial) byte: OR into the bits already written,
	 * buffer is pre-zeroed (see bitstream_rewind) */
	*b++ |= (val & 0xff) << bs->cur.bit;

	/* remaining bytes, 8 bits of val at a time */
	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
		*b++ |= (val >> tmp) & 0xff;

	bitstream_cursor_advance(&bs->cur, bits);
	return bits;
}
271*4882a593Smuzhiyun
/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 *
 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
 *
 * If there are less than the requested number of valid bits left in the
 * bitstream, still fetches all available bits.
 *
 * Returns number of actually fetched bits.
 */
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
	u64 val;
	unsigned int n;

	if (bits > 64)
		return -EINVAL;

	/* fewer valid bits left than requested? clamp to what remains:
	 * whole buffer minus bits already consumed minus trailing padding */
	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
			- bs->cur.bit - bs->pad_bits;

	if (bits == 0) {
		*out = 0;
		return 0;
	}

	/* get the high bits */
	val = 0;
	n = (bs->cur.bit + bits + 7) >> 3;
	/* n may be at most 9, if cur.bit + bits > 64 */
	/* which means this copies at most 8 byte */
	if (n) {
		/* bytes after the first, interpreted little endian, shifted
		 * up past the (8 - cur.bit) bits contributed by byte 0 */
		memcpy(&val, bs->cur.b+1, n - 1);
		val = le64_to_cpu(val) << (8 - bs->cur.bit);
	}

	/* we still need the low bits */
	val |= bs->cur.b[0] >> bs->cur.bit;

	/* and mask out bits we don't want */
	val &= ~0ULL >> (64 - bits);

	bitstream_cursor_advance(&bs->cur, bits);
	*out = val;

	return bits;
}
319*4882a593Smuzhiyun
320*4882a593Smuzhiyun /* encodes @in as vli into @bs;
321*4882a593Smuzhiyun
322*4882a593Smuzhiyun * return values
323*4882a593Smuzhiyun * > 0: number of bits successfully stored in bitstream
324*4882a593Smuzhiyun * -ENOBUFS @bs is full
325*4882a593Smuzhiyun * -EINVAL input zero (invalid)
326*4882a593Smuzhiyun * -EOVERFLOW input too large for this vli code (invalid)
327*4882a593Smuzhiyun */
vli_encode_bits(struct bitstream * bs,u64 in)328*4882a593Smuzhiyun static inline int vli_encode_bits(struct bitstream *bs, u64 in)
329*4882a593Smuzhiyun {
330*4882a593Smuzhiyun u64 code = code;
331*4882a593Smuzhiyun int bits = __vli_encode_bits(&code, in);
332*4882a593Smuzhiyun
333*4882a593Smuzhiyun if (bits <= 0)
334*4882a593Smuzhiyun return bits;
335*4882a593Smuzhiyun
336*4882a593Smuzhiyun return bitstream_put_bits(bs, code, bits);
337*4882a593Smuzhiyun }
338*4882a593Smuzhiyun
339*4882a593Smuzhiyun #endif
340