drbd_vli.h | Explore in Territory

/* SPDX-License-Identifier: GPL-2.0-only */
/*
-*- linux-c -*-
   drbd_receiver.c
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <[email protected]>.
   Copyright (C) 2002-2008, Lars Ellenberg <[email protected]>.

 */

#ifndef _DRBD_VLI_H
#define _DRBD_VLI_H

/*
 * At a granularity of 4KiB storage represented per bit,
 * and stroage sizes of several TiB,
 * and possibly small-bandwidth replication,
 * the bitmap transfer time can take much too long,
 * if transmitted in plain text.
 *
 * We try to reduce the transferred bitmap information
 * by encoding runlengths of bit polarity.
 *
 * We never actually need to encode a "zero" (runlengths are positive).
 * But then we have to store the value of the first bit.
 * The first bit of information thus shall encode if the first runlength
 * gives the number of set or unset bits.
 *
 * We assume that large areas are either completely set or unset,
 * which gives good compression with any runlength method,
 * even when encoding the runlength as fixed size 32bit/64bit integers.
 *
 * Still, there may be areas where the polarity flips every few bits,
 * and encoding the runlength sequence of those areas with fix size
 * integers would be much worse than plaintext.
 *
 * We want to encode small runlength values with minimum code length,
 * while still being able to encode a Huge run of all zeros.
 *
 * Thus we need a Variable Length Integer encoding, VLI.
 *
 * For some cases, we produce more code bits than plaintext input.
 * We need to send incompressible chunks as plaintext, skip over them
 * and then see if the next chunk compresses better.
 *
 * We don't care too much about "excellent" compression ratio for large
 * runlengths (all set/all clear): whether we achieve a factor of 100
 * or 1000 is not that much of an issue.
 * We do not want to waste too much on short runlengths in the "noisy"
 * parts of the bitmap, though.
 *
 * There are endless variants of VLI, we experimented with:
 *  * simple byte-based
 *  * various bit based with different code word length.
 *
 * To avoid yet an other configuration parameter (choice of bitmap compression
 * algorithm) which was difficult to explain and tune, we just chose the one
 * variant that turned out best in all test cases.
 * Based on real world usage patterns, with device sizes ranging from a few GiB
 * to several TiB, file server/mailserver/webserver/mysql/postgress,
 * mostly idle to really busy, the all time winner (though sometimes only
 * marginally better) is:
 */

/*
 * encoding is "visualised" as
 * __little endian__ bitstream, least significant bit first (left most)
 *
 * this particular encoding is chosen so that the prefix code
 * starts as unary encoding the level, then modified so that
 * 10 levels can be described in 8bit, with minimal overhead
 * for the smaller levels.
 *
 * Number of data bits follow fibonacci sequence, with the exception of the
 * last level (+1 data bit, so it makes 64bit total).  The only worse code when
 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
prefix    data bits                                    max val  Nº data bits
0 x                                                         0x2            1
10 x                                                        0x4            1
110 xx                                                      0x8            2
1110 xxx                                                   0x10            3
11110 xxx xx                                               0x30            5
111110 xx xxxxxx                                          0x130            8
11111100  xxxxxxxx xxxxx                                 0x2130           13
11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
 * maximum encodable value: 0x100000400202130 == 2**56 + some */

/* compression "table":
 transmitted   x                                0.29
 as plaintext x                                  ........................
             x                                   ........................
            x                                    ........................
           x    0.59                         0.21........................
          x      ........................................................
         x       .. c ...................................................
        x    0.44.. o ...................................................
       x .......... d ...................................................
      x  .......... e ...................................................
     X.............   ...................................................
    x.............. b ...................................................
2.0x............... i ...................................................
 #X................ t ...................................................
 #................. s ...........................  plain bits  ..........
-+-----------------------------------------------------------------------
 1             16              32                              64
*/

/* LEVEL: (total bits, prefix bits, prefix value),
 * sorted ascending by number of total bits.
 * The rest of the code table is calculated at compiletime from this. */

/* fibonacci data 1, 1, ... */
#define VLI_L_1_1 …

/* finds a suitable level to decode the least significant part of in.
 * returns number of bits consumed.
 *
 * BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{ … }

/* return number of code bits needed,
 * or negative error number */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{ … }

#undef VLI_L_1_1

/* code from here down is independend of actually used bit code */

/*
 * Code length is determined by some unique (e.g. unary) prefix.
 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
 * not a byte stream.
 */

/* for the bitstream, we need a cursor */
struct bitstream_cursor { … };

/* initialize cursor to point to first bit of stream */
static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
{ … }

/* advance cursor by that many bits; maximum expected input value: 64,
 * but depending on VLI implementation, it may be more. */
static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
{ … }

/* the bitstream itself knows its length */
struct bitstream { … };

static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
{ … }

static inline void bitstream_rewind(struct bitstream *bs)
{ … }

/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 * Ignores "pad_bits".
 * Returns zero if bits == 0 (nothing to do).
 * Returns number of bits used if successful.
 *
 * If there is not enough room left in bitstream,
 * leaves bitstream unchanged and returns -ENOBUFS.
 */
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{ … }

/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 *
 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
 *
 * If there are less than the requested number of valid bits left in the
 * bitstream, still fetches all available bits.
 *
 * Returns number of actually fetched bits.
 */
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{ … }

/* encodes @in as vli into @bs;

 * return values
 *  > 0: number of bits successfully stored in bitstream
 * -ENOBUFS @bs is full
 * -EINVAL input zero (invalid)
 * -EOVERFLOW input too large for this vli code (invalid)
 */
static inline int vli_encode_bits(struct bitstream *bs, u64 in)
{ … }

#endif
linux/drivers/block/drbd/drbd_vli.h