1*4882a593Smuzhiyun /* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun -*- linux-c -*-
   drbd_vli.h
5*4882a593Smuzhiyun This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8*4882a593Smuzhiyun Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9*4882a593Smuzhiyun Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10*4882a593Smuzhiyun
11*4882a593Smuzhiyun */
12*4882a593Smuzhiyun
13*4882a593Smuzhiyun #ifndef _DRBD_VLI_H
14*4882a593Smuzhiyun #define _DRBD_VLI_H
15*4882a593Smuzhiyun
16*4882a593Smuzhiyun /*
17*4882a593Smuzhiyun * At a granularity of 4KiB storage represented per bit,
 * and storage sizes of several TiB,
19*4882a593Smuzhiyun * and possibly small-bandwidth replication,
20*4882a593Smuzhiyun * the bitmap transfer time can take much too long,
21*4882a593Smuzhiyun * if transmitted in plain text.
22*4882a593Smuzhiyun *
23*4882a593Smuzhiyun * We try to reduce the transferred bitmap information
24*4882a593Smuzhiyun * by encoding runlengths of bit polarity.
25*4882a593Smuzhiyun *
26*4882a593Smuzhiyun * We never actually need to encode a "zero" (runlengths are positive).
27*4882a593Smuzhiyun * But then we have to store the value of the first bit.
28*4882a593Smuzhiyun * The first bit of information thus shall encode if the first runlength
29*4882a593Smuzhiyun * gives the number of set or unset bits.
30*4882a593Smuzhiyun *
31*4882a593Smuzhiyun * We assume that large areas are either completely set or unset,
32*4882a593Smuzhiyun * which gives good compression with any runlength method,
33*4882a593Smuzhiyun * even when encoding the runlength as fixed size 32bit/64bit integers.
34*4882a593Smuzhiyun *
35*4882a593Smuzhiyun * Still, there may be areas where the polarity flips every few bits,
36*4882a593Smuzhiyun * and encoding the runlength sequence of those areas with fix size
37*4882a593Smuzhiyun * integers would be much worse than plaintext.
38*4882a593Smuzhiyun *
39*4882a593Smuzhiyun * We want to encode small runlength values with minimum code length,
40*4882a593Smuzhiyun * while still being able to encode a Huge run of all zeros.
41*4882a593Smuzhiyun *
42*4882a593Smuzhiyun * Thus we need a Variable Length Integer encoding, VLI.
43*4882a593Smuzhiyun *
44*4882a593Smuzhiyun * For some cases, we produce more code bits than plaintext input.
45*4882a593Smuzhiyun * We need to send incompressible chunks as plaintext, skip over them
46*4882a593Smuzhiyun * and then see if the next chunk compresses better.
47*4882a593Smuzhiyun *
48*4882a593Smuzhiyun * We don't care too much about "excellent" compression ratio for large
49*4882a593Smuzhiyun * runlengths (all set/all clear): whether we achieve a factor of 100
50*4882a593Smuzhiyun * or 1000 is not that much of an issue.
51*4882a593Smuzhiyun * We do not want to waste too much on short runlengths in the "noisy"
52*4882a593Smuzhiyun * parts of the bitmap, though.
53*4882a593Smuzhiyun *
54*4882a593Smuzhiyun * There are endless variants of VLI, we experimented with:
55*4882a593Smuzhiyun * * simple byte-based
56*4882a593Smuzhiyun * * various bit based with different code word length.
57*4882a593Smuzhiyun *
 * To avoid yet another configuration parameter (choice of bitmap compression
59*4882a593Smuzhiyun * algorithm) which was difficult to explain and tune, we just chose the one
60*4882a593Smuzhiyun * variant that turned out best in all test cases.
61*4882a593Smuzhiyun * Based on real world usage patterns, with device sizes ranging from a few GiB
62*4882a593Smuzhiyun * to several TiB, file server/mailserver/webserver/mysql/postgress,
63*4882a593Smuzhiyun * mostly idle to really busy, the all time winner (though sometimes only
64*4882a593Smuzhiyun * marginally better) is:
65*4882a593Smuzhiyun */
66*4882a593Smuzhiyun
67*4882a593Smuzhiyun /*
68*4882a593Smuzhiyun * encoding is "visualised" as
69*4882a593Smuzhiyun * __little endian__ bitstream, least significant bit first (left most)
70*4882a593Smuzhiyun *
71*4882a593Smuzhiyun * this particular encoding is chosen so that the prefix code
72*4882a593Smuzhiyun * starts as unary encoding the level, then modified so that
73*4882a593Smuzhiyun * 10 levels can be described in 8bit, with minimal overhead
74*4882a593Smuzhiyun * for the smaller levels.
75*4882a593Smuzhiyun *
76*4882a593Smuzhiyun * Number of data bits follow fibonacci sequence, with the exception of the
77*4882a593Smuzhiyun * last level (+1 data bit, so it makes 64bit total). The only worse code when
78*4882a593Smuzhiyun * encoding bit polarity runlength is 1 plain bits => 2 code bits.
79*4882a593Smuzhiyun prefix data bits max val Nº data bits
80*4882a593Smuzhiyun 0 x 0x2 1
81*4882a593Smuzhiyun 10 x 0x4 1
82*4882a593Smuzhiyun 110 xx 0x8 2
83*4882a593Smuzhiyun 1110 xxx 0x10 3
84*4882a593Smuzhiyun 11110 xxx xx 0x30 5
85*4882a593Smuzhiyun 111110 xx xxxxxx 0x130 8
86*4882a593Smuzhiyun 11111100 xxxxxxxx xxxxx 0x2130 13
87*4882a593Smuzhiyun 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
88*4882a593Smuzhiyun 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
89*4882a593Smuzhiyun 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
90*4882a593Smuzhiyun * maximum encodable value: 0x100000400202130 == 2**56 + some */
91*4882a593Smuzhiyun
92*4882a593Smuzhiyun /* compression "table":
93*4882a593Smuzhiyun transmitted x 0.29
94*4882a593Smuzhiyun as plaintext x ........................
95*4882a593Smuzhiyun x ........................
96*4882a593Smuzhiyun x ........................
97*4882a593Smuzhiyun x 0.59 0.21........................
98*4882a593Smuzhiyun x ........................................................
99*4882a593Smuzhiyun x .. c ...................................................
100*4882a593Smuzhiyun x 0.44.. o ...................................................
101*4882a593Smuzhiyun x .......... d ...................................................
102*4882a593Smuzhiyun x .......... e ...................................................
103*4882a593Smuzhiyun X............. ...................................................
104*4882a593Smuzhiyun x.............. b ...................................................
105*4882a593Smuzhiyun 2.0x............... i ...................................................
106*4882a593Smuzhiyun #X................ t ...................................................
107*4882a593Smuzhiyun #................. s ........................... plain bits ..........
108*4882a593Smuzhiyun -+-----------------------------------------------------------------------
109*4882a593Smuzhiyun 1 16 32 64
110*4882a593Smuzhiyun */
111*4882a593Smuzhiyun
112*4882a593Smuzhiyun /* LEVEL: (total bits, prefix bits, prefix value),
113*4882a593Smuzhiyun * sorted ascending by number of total bits.
114*4882a593Smuzhiyun * The rest of the code table is calculated at compiletime from this. */
115*4882a593Smuzhiyun
/* fibonacci data 1, 1, ... */
/* Each LEVEL(t, b, v) entry: t = total code bits, b = prefix bits,
 * v = prefix value (the b low bits that identify this level).
 * Data bits per level (t - b): 1 1 2 3 5 8 13 21 34 56 -- fibonacci,
 * except the last, which is bumped so the longest code is exactly 64 bits.
 * Must stay sorted ascending by total bits; both coders walk it in order. */
#define VLI_L_1_1() do { \
	LEVEL( 2, 1, 0x00); \
	LEVEL( 3, 2, 0x01); \
	LEVEL( 5, 3, 0x03); \
	LEVEL( 7, 4, 0x07); \
	LEVEL(10, 5, 0x0f); \
	LEVEL(14, 6, 0x1f); \
	LEVEL(21, 8, 0x3f); \
	LEVEL(29, 8, 0x7f); \
	LEVEL(42, 8, 0xbf); \
	LEVEL(64, 8, 0xff); \
	} while (0)
129*4882a593Smuzhiyun
/* finds a suitable level to decode the least significant part of in.
 * returns number of bits consumed.
 *
 * BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{
	/* base value of the current level; level data bits store an offset
	 * from this base, so zero never needs to be encoded */
	u64 adj = 1;

	/* t: total code bits, b: prefix bits, v: prefix value.
	 * If the b low bits of "in" match this level's prefix, the data bits
	 * are bits b..t-1: mask them out, shift down, add the level base,
	 * and report t bits consumed.  Otherwise add this level's value
	 * range (2^(t-b)) to the base and fall through to the next level. */
#define LEVEL(t,b,v)						\
	do {							\
		if ((in & ((1 << b) -1)) == v) {		\
			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
			return t;				\
		}						\
		adj += 1ULL << (t - b);				\
	} while (0)

	VLI_L_1_1();

	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
	BUG();
#undef LEVEL
}
153*4882a593Smuzhiyun
/* return number of code bits needed,
 * or negative error number */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{
	/* max: largest value representable up to and including the current
	 * level; adj: smallest value representable at the current level */
	u64 max = 0;
	u64 adj = 1;

	/* zero is never encoded: runlengths are strictly positive */
	if (in == 0)
		return -EINVAL;

	/* t: total code bits, b: prefix bits, v: prefix value.
	 * The first level whose value range covers "in" wins: place the
	 * offset (in - adj) in the data bits above the prefix.  "out" may
	 * be NULL when the caller only wants the code length. */
#define LEVEL(t,b,v) do {				\
		max += 1ULL << (t - b);			\
		if (in <= max) {			\
			if (out)			\
				*out = ((in - adj) << b) | v;	\
			return t;			\
		}					\
		adj = max + 1;				\
	} while (0)

	VLI_L_1_1();

	/* larger than the maximum encodable value (see code table above) */
	return -EOVERFLOW;
#undef LEVEL
}
179*4882a593Smuzhiyun
180*4882a593Smuzhiyun #undef VLI_L_1_1
181*4882a593Smuzhiyun
/* code from here down is independent of actually used bit code */
183*4882a593Smuzhiyun
184*4882a593Smuzhiyun /*
185*4882a593Smuzhiyun * Code length is determined by some unique (e.g. unary) prefix.
186*4882a593Smuzhiyun * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
187*4882a593Smuzhiyun * not a byte stream.
188*4882a593Smuzhiyun */
189*4882a593Smuzhiyun
/* for the bitstream, we need a cursor */
struct bitstream_cursor {
	/* the current byte */
	u8 *b;
	/* the current bit within *b, normalized: 0..7 */
	unsigned int bit;
};
197*4882a593Smuzhiyun
198*4882a593Smuzhiyun /* initialize cursor to point to first bit of stream */
bitstream_cursor_reset(struct bitstream_cursor * cur,void * s)199*4882a593Smuzhiyun static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
200*4882a593Smuzhiyun {
201*4882a593Smuzhiyun cur->b = s;
202*4882a593Smuzhiyun cur->bit = 0;
203*4882a593Smuzhiyun }
204*4882a593Smuzhiyun
205*4882a593Smuzhiyun /* advance cursor by that many bits; maximum expected input value: 64,
206*4882a593Smuzhiyun * but depending on VLI implementation, it may be more. */
bitstream_cursor_advance(struct bitstream_cursor * cur,unsigned int bits)207*4882a593Smuzhiyun static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
208*4882a593Smuzhiyun {
209*4882a593Smuzhiyun bits += cur->bit;
210*4882a593Smuzhiyun cur->b = cur->b + (bits >> 3);
211*4882a593Smuzhiyun cur->bit = bits & 7;
212*4882a593Smuzhiyun }
213*4882a593Smuzhiyun
/* the bitstream itself knows its length */
struct bitstream {
	/* current read/write position within buf */
	struct bitstream_cursor cur;
	/* backing byte buffer */
	unsigned char *buf;
	size_t buf_len;		/* in bytes */

	/* for input stream:
	 * number of trailing 0 bits for padding
	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
	unsigned int pad_bits;
};
225*4882a593Smuzhiyun
bitstream_init(struct bitstream * bs,void * s,size_t len,unsigned int pad_bits)226*4882a593Smuzhiyun static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
227*4882a593Smuzhiyun {
228*4882a593Smuzhiyun bs->buf = s;
229*4882a593Smuzhiyun bs->buf_len = len;
230*4882a593Smuzhiyun bs->pad_bits = pad_bits;
231*4882a593Smuzhiyun bitstream_cursor_reset(&bs->cur, bs->buf);
232*4882a593Smuzhiyun }
233*4882a593Smuzhiyun
bitstream_rewind(struct bitstream * bs)234*4882a593Smuzhiyun static inline void bitstream_rewind(struct bitstream *bs)
235*4882a593Smuzhiyun {
236*4882a593Smuzhiyun bitstream_cursor_reset(&bs->cur, bs->buf);
237*4882a593Smuzhiyun memset(bs->buf, 0, bs->buf_len);
238*4882a593Smuzhiyun }
239*4882a593Smuzhiyun
/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 * Ignores "pad_bits".
 * Returns zero if bits == 0 (nothing to do).
 * Returns number of bits used if successful.
 *
 * If there is not enough room left in bitstream,
 * leaves bitstream unchanged and returns -ENOBUFS.
 */
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
	unsigned char *b = bs->cur.b;
	unsigned int tmp;

	if (bits == 0)
		return 0;

	/* reject if the last byte we would touch lies past the buffer end */
	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
		return -ENOBUFS;

	/* paranoia: strip off hi bits; they should not be set anyways. */
	if (bits < 64)
		val &= ~0ULL >> (64 - bits);

	/* first (possibly partial) byte: OR into the bits already written,
	 * buffer is pre-zeroed (see bitstream_rewind) */
	*b++ |= (val & 0xff) << bs->cur.bit;

	/* remaining bytes, 8 bits of val at a time */
	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
		*b++ |= (val >> tmp) & 0xff;

	bitstream_cursor_advance(&bs->cur, bits);
	return bits;
}
271*4882a593Smuzhiyun
/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 *
 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
 *
 * If there are less than the requested number of valid bits left in the
 * bitstream, still fetches all available bits.
 *
 * Returns number of actually fetched bits.
 */
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
	u64 val;
	unsigned int n;

	if (bits > 64)
		return -EINVAL;

	/* fewer valid bits left than requested? clamp to what remains:
	 * whole buffer minus bits already consumed minus trailing padding */
	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
			- bs->cur.bit - bs->pad_bits;

	if (bits == 0) {
		*out = 0;
		return 0;
	}

	/* get the high bits */
	val = 0;
	n = (bs->cur.bit + bits + 7) >> 3;
	/* n may be at most 9, if cur.bit + bits > 64 */
	/* which means this copies at most 8 byte */
	if (n) {
		/* bytes after the first, interpreted little endian, shifted
		 * up past the (8 - cur.bit) bits contributed by byte 0 */
		memcpy(&val, bs->cur.b+1, n - 1);
		val = le64_to_cpu(val) << (8 - bs->cur.bit);
	}

	/* we still need the low bits */
	val |= bs->cur.b[0] >> bs->cur.bit;

	/* and mask out bits we don't want */
	val &= ~0ULL >> (64 - bits);

	bitstream_cursor_advance(&bs->cur, bits);
	*out = val;

	return bits;
}
319*4882a593Smuzhiyun
320*4882a593Smuzhiyun /* encodes @in as vli into @bs;
321*4882a593Smuzhiyun
322*4882a593Smuzhiyun * return values
323*4882a593Smuzhiyun * > 0: number of bits successfully stored in bitstream
324*4882a593Smuzhiyun * -ENOBUFS @bs is full
325*4882a593Smuzhiyun * -EINVAL input zero (invalid)
326*4882a593Smuzhiyun * -EOVERFLOW input too large for this vli code (invalid)
327*4882a593Smuzhiyun */
vli_encode_bits(struct bitstream * bs,u64 in)328*4882a593Smuzhiyun static inline int vli_encode_bits(struct bitstream *bs, u64 in)
329*4882a593Smuzhiyun {
330*4882a593Smuzhiyun u64 code = code;
331*4882a593Smuzhiyun int bits = __vli_encode_bits(&code, in);
332*4882a593Smuzhiyun
333*4882a593Smuzhiyun if (bits <= 0)
334*4882a593Smuzhiyun return bits;
335*4882a593Smuzhiyun
336*4882a593Smuzhiyun return bitstream_put_bits(bs, code, bits);
337*4882a593Smuzhiyun }
338*4882a593Smuzhiyun
339*4882a593Smuzhiyun #endif
340