4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
29 * At a granularity of 4KiB storage represented per bit,
30 * and stroage sizes of several TiB,
31 * and possibly small-bandwidth replication,
32 * the bitmap transfer time can take much too long,
33 * if transmitted in plain text.
35 * We try to reduce the transfered bitmap information
36 * by encoding runlengths of bit polarity.
38 * We never actually need to encode a "zero" (runlengths are positive).
39 * But then we have to store the value of the first bit.
40 * The first bit of information thus shall encode if the first runlength
41 * gives the number of set or unset bits.
43 * We assume that large areas are either completely set or unset,
44 * which gives good compression with any runlength method,
45 * even when encoding the runlength as fixed size 32bit/64bit integers.
47 * Still, there may be areas where the polarity flips every few bits,
48 * and encoding the runlength sequence of those areas with fix size
49 * integers would be much worse than plaintext.
51 * We want to encode small runlength values with minimum code length,
52 * while still being able to encode a Huge run of all zeros.
54 * Thus we need a Variable Length Integer encoding, VLI.
56 * For some cases, we produce more code bits than plaintext input.
57 * We need to send incompressible chunks as plaintext, skip over them
58 * and then see if the next chunk compresses better.
60 * We don't care too much about "excellent" compression ratio for large
61 * runlengths (all set/all clear): whether we achieve a factor of 100
62 * or 1000 is not that much of an issue.
63 * We do not want to waste too much on short runlengths in the "noisy"
64 * parts of the bitmap, though.
66 * There are endless variants of VLI, we experimented with:
68 * * various bit based with different code word length.
70 * To avoid yet an other configuration parameter (choice of bitmap compression
71 * algorithm) which was difficult to explain and tune, we just chose the one
72 * variant that turned out best in all test cases.
73 * Based on real world usage patterns, with device sizes ranging from a few GiB
74 * to several TiB, file server/mailserver/webserver/mysql/postgress,
75 * mostly idle to really busy, the all time winner (though sometimes only
76 * marginally better) is:
80 /* compression "table":
82 as plaintext x ........................
83 x ........................
84 x ........................
85 x 0.59 0.21........................
86 x ........................................................
87 x .. c ...................................................
88 x 0.44.. o ...................................................
89 x .......... d ...................................................
90 x .......... e ...................................................
91 X............. ...................................................
92 x.............. b ...................................................
93 2.0x............... i ...................................................
94 #X................ t ...................................................
95 #................. s ........................... plain bits ..........
96 -+-----------------------------------------------------------------------
100 /* LEVEL: (total bits, prefix bits, prefix value),
101 * sorted ascending by number of total bits.
102 * The rest of the code table is calculated at compiletime from this. */
104 /* fibonacci data 1, 1, ... */
105 #define VLI_L_1_1() do { \
106 LEVEL( 2, 1, 0x00); \
107 LEVEL( 3, 2, 0x01); \
108 LEVEL( 5, 3, 0x03); \
109 LEVEL( 7, 4, 0x07); \
110 LEVEL(10, 5, 0x0f); \
111 LEVEL(14, 6, 0x1f); \
112 LEVEL(21, 8, 0x3f); \
113 LEVEL(29, 8, 0x7f); \
114 LEVEL(42, 8, 0xbf); \
115 LEVEL(64, 8, 0xff); \
118 /* finds a suitable level to decode the least significant part of in.
119 * returns number of bits consumed.
121 * BUG() for bad input, as that would mean a buggy code table. */
122 static inline int vli_decode_bits(u64
*out
, const u64 in
)
126 #define LEVEL(t,b,v) \
128 if ((in & ((1 << b) -1)) == v) { \
129 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
132 adj += 1ULL << (t - b); \
137 /* NOT REACHED, if VLI_LEVELS code table is defined properly */
142 /* return number of code bits needed,
143 * or negative error number */
144 static inline int __vli_encode_bits(u64
*out
, const u64 in
)
152 #define LEVEL(t,b,v) do { \
153 max += 1ULL << (t - b); \
156 *out = ((in - adj) << b) | v; \
170 /* code from here down is independend of actually used bit code */
173 * Code length is determined by some unique (e.g. unary) prefix.
174 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
178 /* for the bitstream, we need a cursor */
179 struct bitstream_cursor
{
180 /* the current byte */
182 /* the current bit within *b, nomalized: 0..7 */
186 /* initialize cursor to point to first bit of stream */
187 static inline void bitstream_cursor_reset(struct bitstream_cursor
*cur
, void *s
)
193 /* advance cursor by that many bits; maximum expected input value: 64,
194 * but depending on VLI implementation, it may be more. */
195 static inline void bitstream_cursor_advance(struct bitstream_cursor
*cur
, unsigned int bits
)
198 cur
->b
= cur
->b
+ (bits
>> 3);
202 /* the bitstream itself knows its length */
204 struct bitstream_cursor cur
;
206 size_t buf_len
; /* in bytes */
209 * number of trailing 0 bits for padding
210 * total number of valid bits in stream: buf_len * 8 - pad_bits */
211 unsigned int pad_bits
;
214 static inline void bitstream_init(struct bitstream
*bs
, void *s
, size_t len
, unsigned int pad_bits
)
218 bs
->pad_bits
= pad_bits
;
219 bitstream_cursor_reset(&bs
->cur
, bs
->buf
);
222 static inline void bitstream_rewind(struct bitstream
*bs
)
224 bitstream_cursor_reset(&bs
->cur
, bs
->buf
);
225 memset(bs
->buf
, 0, bs
->buf_len
);
228 /* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
229 * Ignores "pad_bits".
230 * Returns zero if bits == 0 (nothing to do).
231 * Returns number of bits used if successful.
233 * If there is not enough room left in bitstream,
234 * leaves bitstream unchanged and returns -ENOBUFS.
236 static inline int bitstream_put_bits(struct bitstream
*bs
, u64 val
, const unsigned int bits
)
238 unsigned char *b
= bs
->cur
.b
;
244 if ((bs
->cur
.b
+ ((bs
->cur
.bit
+ bits
-1) >> 3)) - bs
->buf
>= bs
->buf_len
)
247 /* paranoia: strip off hi bits; they should not be set anyways. */
249 val
&= ~0ULL >> (64 - bits
);
251 *b
++ |= (val
& 0xff) << bs
->cur
.bit
;
253 for (tmp
= 8 - bs
->cur
.bit
; tmp
< bits
; tmp
+= 8)
254 *b
++ |= (val
>> tmp
) & 0xff;
256 bitstream_cursor_advance(&bs
->cur
, bits
);
260 /* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
262 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
264 * If there are less than the requested number of valid bits left in the
265 * bitstream, still fetches all available bits.
267 * Returns number of actually fetched bits.
269 static inline int bitstream_get_bits(struct bitstream
*bs
, u64
*out
, int bits
)
277 if (bs
->cur
.b
+ ((bs
->cur
.bit
+ bs
->pad_bits
+ bits
-1) >> 3) - bs
->buf
>= bs
->buf_len
)
278 bits
= ((bs
->buf_len
- (bs
->cur
.b
- bs
->buf
)) << 3)
279 - bs
->cur
.bit
- bs
->pad_bits
;
286 /* get the high bits */
288 n
= (bs
->cur
.bit
+ bits
+ 7) >> 3;
289 /* n may be at most 9, if cur.bit + bits > 64 */
290 /* which means this copies at most 8 byte */
292 memcpy(&val
, bs
->cur
.b
+1, n
- 1);
293 val
= le64_to_cpu(val
) << (8 - bs
->cur
.bit
);
296 /* we still need the low bits */
297 val
|= bs
->cur
.b
[0] >> bs
->cur
.bit
;
299 /* and mask out bits we don't want */
300 val
&= ~0ULL >> (64 - bits
);
302 bitstream_cursor_advance(&bs
->cur
, bits
);
308 /* encodes @in as vli into @bs;
311 * > 0: number of bits successfully stored in bitstream
312 * -ENOBUFS @bs is full
313 * -EINVAL input zero (invalid)
314 * -EOVERFLOW input too large for this vli code (invalid)
316 static inline int vli_encode_bits(struct bitstream
*bs
, u64 in
)
319 int bits
= __vli_encode_bits(&code
, in
);
324 return bitstream_put_bits(bs
, code
, bits
);