s3:printing: Allow to run samba-bgqd as a standalone systemd service
[Samba.git] / lib / compression / lzxpress_huffman.c
blob63b5ffae8eccc8660317b5219a54e8b9610bddec
1 /*
2 * Samba compression library - LGPLv3
4 * Copyright © Catalyst IT 2022
6 * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
7 * and Jo Sutton <josutton@catalyst.net.nz>
9 * ** NOTE! The following LGPL license applies to this file.
10 * ** It does NOT imply that all of Samba is released under the LGPL
12 * This library is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 3 of the License, or (at your option) any later version.
17 * This library is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
26 #include <talloc.h>
28 #include "replace.h"
29 #include "lzxpress_huffman.h"
30 #include "lib/util/stable_sort.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/byteorder.h"
33 #include "lib/util/bytearray.h"
36 * DEBUG_NO_LZ77_MATCHES toggles the encoding of matches as matches. If it is
37 * false the potential match is written as a series of literals, which is a
38 * valid but usually inefficient encoding. This is useful for isolating a
39 * problem to either the LZ77 or the Huffman stage.
41 #ifndef DEBUG_NO_LZ77_MATCHES
42 #define DEBUG_NO_LZ77_MATCHES false
43 #endif
46 * DEBUG_HUFFMAN_TREE forces the drawing of ascii art huffman trees during
47 * compression and decompression.
49 * These trees will also be drawn at DEBUG level 10, but that doesn't work
50 * with cmocka tests.
52 #ifndef DEBUG_HUFFMAN_TREE
53 #define DEBUG_HUFFMAN_TREE false
54 #endif
56 #if DEBUG_HUFFMAN_TREE
57 #define DBG(...) fprintf(stderr, __VA_ARGS__)
58 #else
59 #define DBG(...) DBG_INFO(__VA_ARGS__)
60 #endif
63 #define LZXPRESS_ERROR -1LL
66 * We won't encode a match length longer than MAX_MATCH_LENGTH.
68 * Reports are that Windows has a limit at 64M.
70 #define MAX_MATCH_LENGTH (64 * 1024 * 1024)
73 struct bitstream {
74 const uint8_t *bytes;
75 size_t byte_pos;
76 size_t byte_size;
77 uint32_t bits;
78 int remaining_bits;
79 uint16_t *table;
83 #if ! defined __has_builtin
84 #define __has_builtin(x) 0
85 #endif
88 * bitlen_nonzero_16() returns the bit number of the most significant bit, or
89 * put another way, the integer log base 2. Log(0) is undefined; the argument
90 * has to be non-zero!
91 * 1 -> 0
92 * 2,3 -> 1
93 * 4-7 -> 2
94 * 1024 -> 10, etc
96 * Probably this is handled by a compiler intrinsic function that maps to a
97 * dedicated machine instruction.
100 static inline int bitlen_nonzero_16(uint16_t x)
102 #if __has_builtin(__builtin_clz)
104 /* __builtin_clz returns the number of leading zeros */
105 return (sizeof(unsigned int) * CHAR_BIT) - 1
106 - __builtin_clz((unsigned int) x);
108 #else
110 int count = -1;
111 while(x) {
112 x >>= 1;
113 count++;
115 return count;
117 #endif
121 struct lzxhuff_compressor_context {
122 const uint8_t *input_bytes;
123 size_t input_size;
124 size_t input_pos;
125 size_t prev_block_pos;
126 uint8_t *output;
127 size_t available_size;
128 size_t output_pos;
131 static int compare_huffman_node_count(struct huffman_node *a,
132 struct huffman_node *b)
134 return a->count - b->count;
137 static int compare_huffman_node_depth(struct huffman_node *a,
138 struct huffman_node *b)
140 int c = a->depth - b->depth;
141 if (c != 0) {
142 return c;
144 return (int)a->symbol - (int)b->symbol;
148 #define HASH_MASK ((1 << LZX_HUFF_COMP_HASH_BITS) - 1)
150 static inline uint16_t three_byte_hash(const uint8_t *bytes)
153 * MS-XCA says "three byte hash", but does not specify it.
155 * This one is just cobbled together, but has quite good distribution
156 * in the 12-14 bit forms, which is what we care about most.
157 * e.g: 13 bit: median 2048, min 2022, max 2074, stddev 6.0
159 uint16_t a = bytes[0];
160 uint16_t b = bytes[1] ^ 0x2e;
161 uint16_t c = bytes[2] ^ 0x55;
162 uint16_t ca = c - a;
163 uint16_t d = ((a + b) << 8) ^ (ca << 5) ^ (c + b) ^ (0xcab + a);
164 return d & HASH_MASK;
168 static inline uint16_t encode_match(size_t len, size_t offset)
170 uint16_t code = 256;
171 code |= MIN(len - 3, 15);
172 code |= bitlen_nonzero_16(offset) << 4;
173 return code;
177 * debug_huffman_tree() uses debug_huffman_tree_print() to draw the Huffman
178 * tree in ascii art.
180 * Note that the Huffman tree is probably not the same as that implied by the
181 * canonical Huffman encoding that is finally used. That tree would be the
182 * same shape, but with the left and right toggled to sort the branches by
183 * length, after which the symbols for each length sorted by value.
186 static void debug_huffman_tree_print(struct huffman_node *node,
187 int *trail, int depth)
189 if (node->left == NULL) {
190 /* time to print a row */
191 int j;
192 bool branched = false;
193 int row[17];
194 char c[100];
195 int s = node->symbol;
196 char code[17];
197 if (depth > 15) {
198 fprintf(stderr,
199 " \033[1;31m Max depth exceeded! (%d)\033[0m "
200 " symbol %#3x claimed depth %d count %d\n",
201 depth, node->symbol, node->depth, node->count);
202 return;
204 for (j = depth - 1; j >= 0; j--) {
205 if (branched) {
206 if (trail[j] == -1) {
207 row[j] = -3;
208 } else {
209 row[j] = -2;
211 } else if (trail[j] == -1) {
212 row[j] = -1;
213 branched = true;
214 } else {
215 row[j] = trail[j];
218 for (j = 0; j < depth; j++) {
219 switch (row[j]) {
220 case -3:
221 code[j] = '1';
222 fprintf(stderr, " ");
223 break;
224 case -2:
225 code[j] = '0';
226 fprintf(stderr, " │ ");
227 break;
228 case -1:
229 code[j] = '1';
230 fprintf(stderr, " ╰─");
231 break;
232 default:
233 code[j] = '0';
234 fprintf(stderr, "%5d─┬─", row[j]);
235 break;
238 code[depth] = 0;
239 if (s < 32) {
240 snprintf(c, sizeof(c),
241 "\033[1;32m%02x\033[0m \033[1;33m%c%c%c\033[0m",
243 0xE2, 0x90, 0x80 + s); /* utf-8 for symbol */
244 } else if (s < 127) {
245 snprintf(c, sizeof(c),
246 "\033[1;32m%2x\033[0m '\033[10;32m%c\033[0m'",
247 s, s);
248 } else if (s < 256) {
249 snprintf(c, sizeof(c), "\033[1;32m%2x\033[0m", s);
250 } else {
251 uint16_t len = (s & 15) + 3;
252 uint16_t dbits = ((s >> 4) & 15) + 1;
253 snprintf(c, sizeof(c),
254 " \033[0;33mlen:%2d%s, "
255 "dist:%d-%d \033[0m \033[1;32m%3x\033[0m%s",
256 len,
257 len == 18 ? "+" : "",
258 1 << (dbits - 1),
259 (1 << dbits) - 1,
261 s == 256 ? " \033[1;31mEOF\033[0m" : "");
265 fprintf(stderr, "──%5d %s \033[2;37m%s\033[0m\n",
266 node->count, c, code);
267 return;
269 trail[depth] = node->count;
270 debug_huffman_tree_print(node->left, trail, depth + 1);
271 trail[depth] = -1;
272 debug_huffman_tree_print(node->right, trail, depth + 1);
277 * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree()
278 * will print a tree looking something like this:
280 * 7─┬─── 3 len:18+, dist:1-1 10f 0
281 * ╰─ 4─┬─ 2─┬─── 1 61 'a' 100
282 * │ ╰─── 1 62 'b' 101
283 * ╰─ 2─┬─── 1 63 'c' 110
284 * ╰─── 1 len: 3, dist:1-1 100 EOF 111
286 * This is based off a Huffman root node, and the tree may not be the same as
287 * the canonical tree.
289 static void debug_huffman_tree(struct huffman_node *root)
291 int trail[17];
292 debug_huffman_tree_print(root, trail, 0);
297 * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree_from_table()
298 * will print something like this based on a decoding symbol table.
300 * Tree from decoding table 9 nodes → 5 codes
301 * 10000─┬─── 5000 len:18+, dist:1-1 10f 0
302 * ╰─ 5000─┬─ 2500─┬─── 1250 61 'a' 100
303 * │ ╰─── 1250 62 'b' 101
304 * ╰─ 2500─┬─── 1250 63 'c' 110
305 * ╰─── 1250 len: 3, dist:1-1 100 EOF 111
307 * This is the canonical form of the Huffman tree where the actual counts
308 * aren't known (we use "10000" to help indicate relative frequencies).
310 static void debug_huffman_tree_from_table(uint16_t *table)
312 int trail[17];
313 struct huffman_node nodes[1024] = {{0}};
314 uint16_t codes[1024];
315 size_t n = 1;
316 size_t i = 0;
317 codes[0] = 0;
318 nodes[0].count = 10000;
320 while (i < n) {
321 uint16_t index = codes[i];
322 struct huffman_node *node = &nodes[i];
323 if (table[index] == 0xffff) {
324 /* internal node */
325 index <<= 1;
326 /* left */
327 index++;
328 codes[n] = index;
329 node->left = nodes + n;
330 nodes[n].count = node->count >> 1;
331 n++;
332 /*right*/
333 index++;
334 codes[n] = index;
335 node->right = nodes + n;
336 nodes[n].count = node->count >> 1;
337 n++;
338 } else {
339 /* leaf node */
340 node->symbol = table[index] & 511;
342 i++;
345 fprintf(stderr,
346 "\033[1;34m Tree from decoding table\033[0m "
347 "%zu nodes → %zu codes\n",
348 n, (n + 1) / 2);
349 debug_huffman_tree_print(nodes, trail, 0);
353 static bool depth_walk(struct huffman_node *n, uint32_t depth)
355 bool ok;
356 if (n->left == NULL) {
357 /* this is a leaf, record the depth */
358 n->depth = depth;
359 return true;
361 if (depth > 14) {
362 return false;
364 ok = (depth_walk(n->left, depth + 1) &&
365 depth_walk(n->right, depth + 1));
367 return ok;
371 static bool check_and_record_depths(struct huffman_node *root)
373 return depth_walk(root, 0);
377 static bool encode_values(struct huffman_node *leaves,
378 size_t n_leaves,
379 uint16_t symbol_values[512])
381 size_t i;
383 * See, we have a leading 1 in our internal code representation, which
384 * indicates the code length.
386 uint32_t code = 1;
387 uint32_t code_len = 0;
388 memset(symbol_values, 0, sizeof(uint16_t) * 512);
389 for (i = 0; i < n_leaves; i++) {
390 code <<= leaves[i].depth - code_len;
391 code_len = leaves[i].depth;
393 symbol_values[leaves[i].symbol] = code;
394 code++;
397 * The last code should be 11111... with code_len + 1 ones. The final
398 * code++ will wrap this round to 1000... with code_len + 1 zeroes.
401 if (code != 2 << code_len) {
402 return false;
404 return true;
408 static int generate_huffman_codes(struct huffman_node *leaf_nodes,
409 struct huffman_node *internal_nodes,
410 uint16_t symbol_values[512])
412 size_t head_leaf = 0;
413 size_t head_branch = 0;
414 size_t tail_branch = 0;
415 struct huffman_node *huffman_root = NULL;
416 size_t i, j;
417 size_t n_leaves = 0;
420 * Before we sort the nodes, we can eliminate the unused ones.
422 for (i = 0; i < 512; i++) {
423 if (leaf_nodes[i].count) {
424 leaf_nodes[n_leaves] = leaf_nodes[i];
425 n_leaves++;
428 if (n_leaves == 0) {
429 return LZXPRESS_ERROR;
431 if (n_leaves == 1) {
433 * There is *almost* no way this should happen, and it would
434 * ruin the tree (because the shortest possible codes are 1
435 * bit long, and there are two of them).
437 * The only way to get here is in an internal block in a
438 * 3-or-more block message (i.e. > 128k), which consists
439 * entirely of a match starting in the previous block (if it
440 * was the end block, it would have the EOF symbol).
442 * What we do is add a dummy symbol which is this one XOR 256.
443 * It won't be used in the stream but will balance the tree.
445 leaf_nodes[1] = leaf_nodes[0];
446 leaf_nodes[1].symbol ^= 0x100;
447 n_leaves = 2;
450 /* note, in sort we're using internal_nodes as auxiliary space */
451 stable_sort(leaf_nodes,
452 internal_nodes,
453 n_leaves,
454 sizeof(struct huffman_node),
455 (samba_compare_fn_t)compare_huffman_node_count);
458 * This outer loop is for re-quantizing the counts if the tree is too
459 * tall (>15), which we need to do because the final encoding can't
460 * express a tree that deep.
462 * In theory, this should be a 'while (true)' loop, but we chicken
463 * out with 10 iterations, just in case.
465 * In practice it will almost always resolve in the first round; if
466 * not then, in the second or third. Remember we'll looking at 64k or
467 * less, so the rarest we can have is 1 in 64k; each round of
468 * quantization effectively doubles its frequency to 1 in 32k, 1 in
469 * 16k, etc, until we're treating the rare symbol as actually quite
470 * common.
472 for (j = 0; j < 10; j++) {
473 bool less_than_15_bits;
474 while (true) {
475 struct huffman_node *a = NULL;
476 struct huffman_node *b = NULL;
477 size_t leaf_len = n_leaves - head_leaf;
478 size_t internal_len = tail_branch - head_branch;
480 if (leaf_len + internal_len == 1) {
482 * We have the complete tree. The root will be
483 * an internal node unless there is just one
484 * symbol, which is already impossible.
486 if (unlikely(leaf_len == 1)) {
487 return LZXPRESS_ERROR;
488 } else {
489 huffman_root = \
490 &internal_nodes[head_branch];
492 break;
495 * We know here we have at least two nodes, and we
496 * want to select the two lowest scoring ones. Those
497 * have to be either a) the head of each queue, or b)
498 * the first two nodes of either queue.
500 * The complicating factors are: a) we need to check
501 * the length of each queue, and b) in the case of
502 * ties, we prefer to pair leaves with leaves.
504 * Note a complication we don't have: the leaf node
505 * queue never grows, and the subtree queue starts
506 * empty and cannot grow beyond n - 1. It feeds on
507 * itself. We don't need to think about overflow.
509 if (leaf_len == 0) {
510 /* two from subtrees */
511 a = &internal_nodes[head_branch];
512 b = &internal_nodes[head_branch + 1];
513 head_branch += 2;
514 } else if (internal_len == 0) {
515 /* two from nodes */
516 a = &leaf_nodes[head_leaf];
517 b = &leaf_nodes[head_leaf + 1];
518 head_leaf += 2;
519 } else if (leaf_len == 1 && internal_len == 1) {
520 /* one of each */
521 a = &leaf_nodes[head_leaf];
522 b = &internal_nodes[head_branch];
523 head_branch++;
524 head_leaf++;
525 } else {
527 * Take the lowest head, twice, checking for
528 * length after taking the first one.
530 if (leaf_nodes[head_leaf].count >
531 internal_nodes[head_branch].count) {
532 a = &internal_nodes[head_branch];
533 head_branch++;
534 if (internal_len == 1) {
535 b = &leaf_nodes[head_leaf];
536 head_leaf++;
537 goto done;
539 } else {
540 a = &leaf_nodes[head_leaf];
541 head_leaf++;
542 if (leaf_len == 1) {
543 b = &internal_nodes[head_branch];
544 head_branch++;
545 goto done;
548 /* the other node */
549 if (leaf_nodes[head_leaf].count >
550 internal_nodes[head_branch].count) {
551 b = &internal_nodes[head_branch];
552 head_branch++;
553 } else {
554 b = &leaf_nodes[head_leaf];
555 head_leaf++;
558 done:
560 * Now we add a new node to the subtrees list that
561 * combines the score of node_a and node_b, and points
562 * to them as children.
564 internal_nodes[tail_branch].count = a->count + b->count;
565 internal_nodes[tail_branch].left = a;
566 internal_nodes[tail_branch].right = b;
567 tail_branch++;
568 if (tail_branch == n_leaves) {
570 * We're not getting here, no way, never ever.
571 * Unless we made a terrible mistake.
573 * That is, in a binary tree with n leaves,
574 * there are ALWAYS n-1 internal nodes.
576 return LZXPRESS_ERROR;
579 if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) {
580 debug_huffman_tree(huffman_root);
583 * We have a tree, and need to turn it into a lookup table,
584 * and see if it is shallow enough (<= 15).
586 less_than_15_bits = check_and_record_depths(huffman_root);
587 if (less_than_15_bits) {
589 * Now the leaf nodes know how deep they are, and we
590 * no longer need the internal nodes.
592 * We need to sort the nodes of equal depth, so that
593 * they are sorted by depth first, and symbol value
594 * second. The internal_nodes can again be auxiliary
595 * memory.
597 stable_sort(
598 leaf_nodes,
599 internal_nodes,
600 n_leaves,
601 sizeof(struct huffman_node),
602 (samba_compare_fn_t)compare_huffman_node_depth);
604 encode_values(leaf_nodes, n_leaves, symbol_values);
606 return n_leaves;
610 * requantize by halving and rounding up, so that small counts
611 * become relatively bigger. This will lead to a flatter tree.
613 for (i = 0; i < n_leaves; i++) {
614 leaf_nodes[i].count >>= 1;
615 leaf_nodes[i].count += 1;
617 head_leaf = 0;
618 head_branch = 0;
619 tail_branch = 0;
621 return LZXPRESS_ERROR;
625 * LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS is how far ahead to search in the
626 * circular hash table for a match, before we give up. A bigger number will
627 * generally lead to better but slower compression, but a stupidly big number
628 * will just be worse.
630 * If you're fiddling with this, consider also fiddling with
631 * LZX_HUFF_COMP_HASH_BITS.
633 #define LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS 5
635 static inline void store_match(uint16_t *hash_table,
636 uint16_t h,
637 uint16_t offset)
639 int i;
640 uint16_t o = hash_table[h];
641 uint16_t h2;
642 uint16_t worst_h;
643 int worst_score;
645 if (o == 0xffff) {
646 /* there is nothing there yet */
647 hash_table[h] = offset;
648 return;
650 for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
651 h2 = (h + i) & HASH_MASK;
652 if (hash_table[h2] == 0xffff) {
653 hash_table[h2] = offset;
654 return;
658 * There are no slots, but we really want to store this, so we'll kick
659 * out the one with the longest distance.
661 worst_h = h;
662 worst_score = offset - o;
663 for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
664 int score;
665 h2 = (h + i) & HASH_MASK;
666 o = hash_table[h2];
667 score = offset - o;
668 if (score > worst_score) {
669 worst_score = score;
670 worst_h = h2;
673 hash_table[worst_h] = offset;
678 * Yes, struct match looks a lot like a DATA_BLOB.
680 struct match {
681 const uint8_t *there;
682 size_t length;
686 static inline struct match lookup_match(uint16_t *hash_table,
687 uint16_t h,
688 const uint8_t *data,
689 const uint8_t *here,
690 size_t max_len)
692 int i;
693 uint16_t o = hash_table[h];
694 uint16_t h2;
695 size_t len;
696 const uint8_t *there = NULL;
697 struct match best = {0};
699 for (i = 0; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
700 h2 = (h + i) & HASH_MASK;
701 o = hash_table[h2];
702 if (o == 0xffff) {
704 * in setting this, we would never have stepped over
705 * an 0xffff, so we won't now.
707 break;
709 there = data + o;
710 if (here - there > 65534 || there > here) {
711 continue;
715 * When we already have a long match, we can try to avoid
716 * measuring out another long, but shorter match.
718 if (best.length > 1000 &&
719 there[best.length - 1] != best.there[best.length - 1]) {
720 continue;
723 for (len = 0;
724 len < max_len && here[len] == there[len];
725 len++) {
726 /* counting */
728 if (len > 2) {
730 * As a tiebreaker, we prefer the closer match which
731 * is likely to encode smaller (and certainly no worse).
733 if (len > best.length ||
734 (len == best.length && there > best.there)) {
735 best.length = len;
736 best.there = there;
740 return best;
745 static ssize_t lz77_encode_block(struct lzxhuff_compressor_context *cmp_ctx,
746 struct lzxhuff_compressor_mem *cmp_mem,
747 uint16_t *hash_table,
748 uint16_t *prev_hash_table)
750 uint16_t *intermediate = cmp_mem->intermediate;
751 struct huffman_node *leaf_nodes = cmp_mem->leaf_nodes;
752 uint16_t *symbol_values = cmp_mem->symbol_values;
753 size_t i, j, intermediate_len;
754 const uint8_t *data = cmp_ctx->input_bytes + cmp_ctx->input_pos;
755 const uint8_t *prev_block = NULL;
756 size_t remaining_size = cmp_ctx->input_size - cmp_ctx->input_pos;
757 size_t block_end = MIN(65536, remaining_size);
758 struct match match;
759 int n_symbols;
761 if (cmp_ctx->input_size < cmp_ctx->input_pos) {
762 return LZXPRESS_ERROR;
765 if (cmp_ctx->prev_block_pos != cmp_ctx->input_pos) {
766 prev_block = cmp_ctx->input_bytes + cmp_ctx->prev_block_pos;
767 } else if (prev_hash_table != NULL) {
768 /* we've got confused! hash and block should go together */
769 return LZXPRESS_ERROR;
773 * leaf_nodes is used to count the symbols seen, for later Huffman
774 * encoding.
776 for (i = 0; i < 512; i++) {
777 leaf_nodes[i] = (struct huffman_node) {
778 .symbol = i
782 j = 0;
784 if (remaining_size < 41 || DEBUG_NO_LZ77_MATCHES) {
786 * There is no point doing a hash table and looking for
787 * matches in this tiny block (remembering we are committed to
788 * using 32 bits, so there's a good chance we wouldn't even
789 * save a byte). The threshold of 41 matches Windows.
790 * If remaining_size < 3, we *can't* do the hash.
792 i = 0;
793 } else {
795 * We use 0xffff as the unset value for table, because it is
796 * not a valid match offset (and 0x0 is).
798 memset(hash_table, 0xff, sizeof(cmp_mem->hash_table1));
800 for (i = 0; i <= block_end - 3; i++) {
801 uint16_t code;
802 const uint8_t *here = data + i;
803 uint16_t h = three_byte_hash(here);
804 size_t max_len = MIN(remaining_size - i, MAX_MATCH_LENGTH);
805 match = lookup_match(hash_table,
807 data,
808 here,
809 max_len);
811 if (match.there == NULL && prev_hash_table != NULL) {
813 * If this is not the first block,
814 * backreferences can look into the previous
815 * block (but only as far as 65535 bytes, so
816 * the end of this block cannot see the start
817 * of the last one).
819 match = lookup_match(prev_hash_table,
821 prev_block,
822 here,
823 remaining_size - i);
826 store_match(hash_table, h, i);
828 if (match.there == NULL) {
829 /* add a literal and move on. */
830 uint8_t c = data[i];
831 leaf_nodes[c].count++;
832 intermediate[j] = c;
833 j++;
834 continue;
837 /* a real match */
838 if (match.length <= 65538) {
839 intermediate[j] = 0xffff;
840 intermediate[j + 1] = match.length - 3;
841 intermediate[j + 2] = here - match.there;
842 j += 3;
843 } else {
844 size_t m = match.length - 3;
845 intermediate[j] = 0xfffe;
846 intermediate[j + 1] = m & 0xffff;
847 intermediate[j + 2] = m >> 16;
848 intermediate[j + 3] = here - match.there;
849 j += 4;
851 code = encode_match(match.length, here - match.there);
852 leaf_nodes[code].count++;
853 i += match.length - 1; /* `- 1` for the loop i++ */
855 * A match can take us past the intended block length,
856 * extending the block. We don't need to do anything
857 * special for this case -- the loops will naturally
858 * do the right thing.
864 * There might be some bytes at the end.
866 for (; i < block_end; i++) {
867 leaf_nodes[data[i]].count++;
868 intermediate[j] = data[i];
869 j++;
872 if (i == remaining_size) {
873 /* add a trailing EOF marker (256) */
874 intermediate[j] = 0xffff;
875 intermediate[j + 1] = 0;
876 intermediate[j + 2] = 1;
877 j += 3;
878 leaf_nodes[256].count++;
881 intermediate_len = j;
883 cmp_ctx->prev_block_pos = cmp_ctx->input_pos;
884 cmp_ctx->input_pos += i;
886 /* fill in the symbols table */
887 n_symbols = generate_huffman_codes(leaf_nodes,
888 cmp_mem->internal_nodes,
889 symbol_values);
890 if (n_symbols < 0) {
891 return n_symbols;
894 return intermediate_len;
899 static ssize_t write_huffman_table(uint16_t symbol_values[512],
900 uint8_t *output,
901 size_t available_size)
903 size_t i;
905 if (available_size < 256) {
906 return LZXPRESS_ERROR;
909 for (i = 0; i < 256; i++) {
910 uint8_t b = 0;
911 uint16_t even = symbol_values[i * 2];
912 uint16_t odd = symbol_values[i * 2 + 1];
913 if (even != 0) {
914 b = bitlen_nonzero_16(even);
916 if (odd != 0) {
917 b |= bitlen_nonzero_16(odd) << 4;
919 output[i] = b;
921 return i;
925 struct write_context {
926 uint8_t *dest;
927 size_t dest_len;
928 size_t head; /* where lengths go */
929 size_t next_code; /* where symbol stream goes */
930 size_t pending_next_code; /* will be next_code */
931 unsigned bit_len;
932 uint32_t bits;
936 * Write out 16 bits, little-endian, for write_huffman_codes()
938 * As you'll notice, there's a bit to do.
940 * We are collecting up bits in a uint32_t, then when there are 16 of them we
941 * write out a word into the stream, using a trio of offsets (wc->next_code,
942 * wc->pending_next_code, and wc->head) which dance around ensuring that the
943 * bitstream and the interspersed lengths are in the right places relative to
944 * each other.
947 static inline bool write_bits(struct write_context *wc,
948 uint16_t code, uint16_t length)
950 wc->bits <<= length;
951 wc->bits |= code;
952 wc->bit_len += length;
953 if (wc->bit_len > 16) {
954 uint32_t w = wc->bits >> (wc->bit_len - 16);
955 wc->bit_len -= 16;
956 if (wc->next_code + 2 > wc->dest_len ||
957 unlikely(wc->bit_len > 16)) {
958 return false;
960 wc->dest[wc->next_code] = w & 0xff;
961 wc->dest[wc->next_code + 1] = (w >> 8) & 0xff;
962 wc->next_code = wc->pending_next_code;
963 wc->pending_next_code = wc->head;
964 wc->head += 2;
966 return true;
970 static inline bool write_code(struct write_context *wc, uint16_t code)
972 int code_bit_len = bitlen_nonzero_16(code);
973 if (unlikely(code == 0)) {
974 return false;
976 code &= (1 << code_bit_len) - 1;
977 return write_bits(wc, code, code_bit_len);
980 static inline bool write_byte(struct write_context *wc, uint8_t byte)
982 if (wc->head + 1 > wc->dest_len) {
983 return false;
985 wc->dest[wc->head] = byte;
986 wc->head++;
987 return true;
991 static inline bool write_long_len(struct write_context *wc, size_t len)
993 if (len < 65535) {
994 if (wc->head + 3 > wc->dest_len) {
995 return false;
997 wc->dest[wc->head] = 255;
998 wc->dest[wc->head + 1] = len & 255;
999 wc->dest[wc->head + 2] = len >> 8;
1000 wc->head += 3;
1001 } else {
1002 if (wc->head + 7 > wc->dest_len) {
1003 return false;
1005 wc->dest[wc->head] = 255;
1006 wc->dest[wc->head + 1] = 0;
1007 wc->dest[wc->head + 2] = 0;
1008 wc->dest[wc->head + 3] = len & 255;
1009 wc->dest[wc->head + 4] = (len >> 8) & 255;
1010 wc->dest[wc->head + 5] = (len >> 16) & 255;
1011 wc->dest[wc->head + 6] = (len >> 24) & 255;
1012 wc->head += 7;
1014 return true;
1017 static ssize_t write_compressed_bytes(uint16_t symbol_values[512],
1018 uint16_t *intermediate,
1019 size_t intermediate_len,
1020 uint8_t *dest,
1021 size_t dest_len)
1023 bool ok;
1024 size_t i;
1025 size_t end;
1026 struct write_context wc = {
1027 .head = 4,
1028 .pending_next_code = 2,
1029 .dest = dest,
1030 .dest_len = dest_len
1032 for (i = 0; i < intermediate_len; i++) {
1033 uint16_t c = intermediate[i];
1034 size_t len;
1035 uint16_t distance;
1036 uint16_t code_len = 0;
1037 uint16_t code_dist = 0;
1038 if (c < 256) {
1039 ok = write_code(&wc, symbol_values[c]);
1040 if (!ok) {
1041 return LZXPRESS_ERROR;
1043 continue;
1046 if (c == 0xfffe) {
1047 if (i > intermediate_len - 4) {
1048 return LZXPRESS_ERROR;
1051 len = intermediate[i + 1];
1052 len |= (uint32_t)intermediate[i + 2] << 16;
1053 distance = intermediate[i + 3];
1054 i += 3;
1055 } else if (c == 0xffff) {
1056 if (i > intermediate_len - 3) {
1057 return LZXPRESS_ERROR;
1059 len = intermediate[i + 1];
1060 distance = intermediate[i + 2];
1061 i += 2;
1062 } else {
1063 return LZXPRESS_ERROR;
1065 if (unlikely(distance == 0)) {
1066 return LZXPRESS_ERROR;
1068 /* len has already had 3 subtracted */
1069 if (len >= 15) {
1071 * We are going to need to write extra length
1072 * bytes into the stream, but we don't do it
1073 * now, we do it after the code has been
1074 * written (and before the distance bits).
1076 code_len = 15;
1077 } else {
1078 code_len = len;
1080 code_dist = bitlen_nonzero_16(distance);
1081 c = 256 | (code_dist << 4) | code_len;
1082 if (c > 511) {
1083 return LZXPRESS_ERROR;
1086 ok = write_code(&wc, symbol_values[c]);
1087 if (!ok) {
1088 return LZXPRESS_ERROR;
1091 if (code_len == 15) {
1092 if (len >= 270) {
1093 ok = write_long_len(&wc, len);
1094 } else {
1095 ok = write_byte(&wc, len - 15);
1097 if (! ok) {
1098 return LZXPRESS_ERROR;
1101 if (code_dist != 0) {
1102 uint16_t dist_bits = distance - (1 << code_dist);
1103 ok = write_bits(&wc, dist_bits, code_dist);
1104 if (!ok) {
1105 return LZXPRESS_ERROR;
1110 * There are some intricacies around flushing the bits and returning
1111 * the length.
1113 * If the returned length is not exactly right and there is another
1114 * block, that block will read its huffman table from the wrong place,
1115 * and have all the symbol codes out by a multiple of 4.
1117 end = wc.head;
1118 if (wc.bit_len == 0) {
1119 end -= 2;
1121 ok = write_bits(&wc, 0, 16 - wc.bit_len);
1122 if (!ok) {
1123 return LZXPRESS_ERROR;
1125 for (i = 0; i < 2; i++) {
1127 * Flush out the bits with zeroes. It doesn't matter if we do
1128 * a round too many, as we have buffer space, and have already
1129 * determined the returned length (end).
1131 ok = write_bits(&wc, 0, 16);
1132 if (!ok) {
1133 return LZXPRESS_ERROR;
1136 return end;
1140 static ssize_t lzx_huffman_compress_block(struct lzxhuff_compressor_context *cmp_ctx,
1141 struct lzxhuff_compressor_mem *cmp_mem,
1142 size_t block_no)
1144 ssize_t intermediate_size;
1145 uint16_t *hash_table = NULL;
1146 uint16_t *back_window_hash_table = NULL;
1147 ssize_t bytes_written;
1149 if (cmp_ctx->available_size - cmp_ctx->output_pos < 260) {
1150 /* huffman block + 4 bytes */
1151 return LZXPRESS_ERROR;
1155 * For LZ77 compression, we keep a hash table for the previous block,
1156 * via alternation after the first block.
1158 * LZ77 writes into the intermediate buffer in the cmp_mem context.
1160 if (block_no == 0) {
1161 hash_table = cmp_mem->hash_table1;
1162 back_window_hash_table = NULL;
1163 } else if (block_no & 1) {
1164 hash_table = cmp_mem->hash_table2;
1165 back_window_hash_table = cmp_mem->hash_table1;
1166 } else {
1167 hash_table = cmp_mem->hash_table1;
1168 back_window_hash_table = cmp_mem->hash_table2;
1171 intermediate_size = lz77_encode_block(cmp_ctx,
1172 cmp_mem,
1173 hash_table,
1174 back_window_hash_table);
1176 if (intermediate_size < 0) {
1177 return intermediate_size;
1181 * Write the 256 byte Huffman table, based on the counts gained in
1182 * LZ77 phase.
1184 bytes_written = write_huffman_table(
1185 cmp_mem->symbol_values,
1186 cmp_ctx->output + cmp_ctx->output_pos,
1187 cmp_ctx->available_size - cmp_ctx->output_pos);
1189 if (bytes_written != 256) {
1190 return LZXPRESS_ERROR;
1192 cmp_ctx->output_pos += 256;
1195 * Write the compressed bytes using the LZ77 matches and Huffman codes
1196 * worked out in the previous steps.
1198 bytes_written = write_compressed_bytes(
1199 cmp_mem->symbol_values,
1200 cmp_mem->intermediate,
1201 intermediate_size,
1202 cmp_ctx->output + cmp_ctx->output_pos,
1203 cmp_ctx->available_size - cmp_ctx->output_pos);
1205 if (bytes_written < 0) {
1206 return bytes_written;
1209 cmp_ctx->output_pos += bytes_written;
1210 return bytes_written;
1214 * lzxpress_huffman_max_compressed_size()
1216 * Return the most bytes the compression can take, to allow
1217 * pre-allocation.
1219 size_t lzxpress_huffman_max_compressed_size(size_t input_size)
1222 * In the worst case, the output size should be about the same as the
1223 * input size, plus the 256 byte header per 64k block. We aim for
1224 * ample, but within the order of magnitude.
1226 return input_size + (input_size / 8) + 270;
1230 * lzxpress_huffman_compress_talloc()
1232 * This is the convenience function that allocates the compressor context and
1233 * output memory for you. The return value is the number of bytes written to
1234 * the location indicated by the output pointer.
1236 * The maximum input_size is effectively around 227MB due to the need to guess
1237 * an upper bound on the output size that hits an internal limitation in
1238 * talloc.
1240 * @param mem_ctx TALLOC_CTX parent for the compressed buffer.
1241 * @param input_bytes memory to be compressed.
1242 * @param input_size length of the input buffer.
1243 * @param output destination pointer for the compressed data.
1245 * @return the number of bytes written or -1 on error.
1248 ssize_t lzxpress_huffman_compress_talloc(TALLOC_CTX *mem_ctx,
1249 const uint8_t *input_bytes,
1250 size_t input_size,
1251 uint8_t **output)
1253 struct lzxhuff_compressor_mem *cmp = NULL;
1254 size_t alloc_size = lzxpress_huffman_max_compressed_size(input_size);
1256 ssize_t output_size;
1258 *output = talloc_array(mem_ctx, uint8_t, alloc_size);
1259 if (*output == NULL) {
1260 return LZXPRESS_ERROR;
1263 cmp = talloc(mem_ctx, struct lzxhuff_compressor_mem);
1264 if (cmp == NULL) {
1265 TALLOC_FREE(*output);
1266 return LZXPRESS_ERROR;
1269 output_size = lzxpress_huffman_compress(cmp,
1270 input_bytes,
1271 input_size,
1272 *output,
1273 alloc_size);
1275 talloc_free(cmp);
1277 if (output_size < 0) {
1278 TALLOC_FREE(*output);
1279 return LZXPRESS_ERROR;
1282 *output = talloc_realloc(mem_ctx, *output, uint8_t, output_size);
1283 if (*output == NULL) {
1284 return LZXPRESS_ERROR;
1287 return output_size;
1291 * lzxpress_huffman_compress()
1293 * This is the inconvenience function, slightly faster and fiddlier than
1294 * lzxpress_huffman_compress_talloc().
1296 * To use this, you need to have allocated (but not initialised) a `struct
1297 * lzxhuff_compressor_mem`, and an output buffer. If the buffer is not big
1298 * enough (per `output_size`), you'll get a negative return value, otherwise
1299 * the number of bytes actually consumed, which will always be at least 260.
1301 * The `struct lzxhuff_compressor_mem` is reusable -- it is basically a
1302 * collection of uninitialised memory buffers. The total size is less than
1303 * 150k, so stack allocation is plausible.
1305 * input_size and available_size are limited to the minimum of UINT32_MAX and
1306 * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
1308 * @param cmp_mem a struct lzxhuff_compressor_mem.
1309 * @param input_bytes memory to be compressed.
1310 * @param input_size length of the input buffer.
1311 * @param output destination for the compressed data.
1312 * @param available_size allocated output bytes.
1314 * @return the number of bytes written or -1 on error.
1316 ssize_t lzxpress_huffman_compress(struct lzxhuff_compressor_mem *cmp_mem,
1317 const uint8_t *input_bytes,
1318 size_t input_size,
1319 uint8_t *output,
1320 size_t available_size)
1322 size_t i = 0;
1323 struct lzxhuff_compressor_context cmp_ctx = {
1324 .input_bytes = input_bytes,
1325 .input_size = input_size,
1326 .input_pos = 0,
1327 .prev_block_pos = 0,
1328 .output = output,
1329 .available_size = available_size,
1330 .output_pos = 0
1333 if (input_size == 0) {
1335 * We can't deal with this for a number of reasons (e.g. it
1336 * breaks the Huffman tree), and the output will be infinitely
1337 * bigger than the input. The caller needs to go and think
1338 * about what they're trying to do here.
1340 return LZXPRESS_ERROR;
1343 if (input_size > SSIZE_MAX ||
1344 input_size > UINT32_MAX ||
1345 available_size > SSIZE_MAX ||
1346 available_size > UINT32_MAX ||
1347 available_size == 0) {
1349 * We use negative ssize_t to return errors, which is limiting
1350 * on 32 bit machines; otherwise we adhere to Microsoft's 4GB
1351 * limit.
1353 * lzxpress_huffman_compress_talloc() will not get this far,
1354 * having already have failed on talloc's 256 MB limit.
1356 return LZXPRESS_ERROR;
1359 if (cmp_mem == NULL ||
1360 output == NULL ||
1361 input_bytes == NULL) {
1362 return LZXPRESS_ERROR;
1365 while (cmp_ctx.input_pos < cmp_ctx.input_size) {
1366 ssize_t ret;
1367 ret = lzx_huffman_compress_block(&cmp_ctx,
1368 cmp_mem,
1370 if (ret < 0) {
1371 return ret;
1373 i++;
1376 return cmp_ctx.output_pos;
1379 static void debug_tree_codes(struct bitstream *input)
1383 size_t head = 0;
1384 size_t tail = 2;
1385 size_t ffff_count = 0;
1386 struct q {
1387 uint16_t tree_code;
1388 uint16_t code_code;
1390 struct q queue[65536];
1391 char bits[17];
1392 uint16_t *t = input->table;
1393 queue[0].tree_code = 1;
1394 queue[0].code_code = 2;
1395 queue[1].tree_code = 2;
1396 queue[1].code_code = 3;
1397 while (head < tail) {
1398 struct q q = queue[head];
1399 uint16_t x = t[q.tree_code];
1400 if (x != 0xffff) {
1401 int k;
1402 uint16_t j = q.code_code;
1403 size_t offset = bitlen_nonzero_16(j) - 1;
1404 if (unlikely(j == 0)) {
1405 DBG("BROKEN code is 0!\n");
1406 return;
1409 for (k = 0; k <= offset; k++) {
1410 bool b = (j >> (offset - k)) & 1;
1411 bits[k] = b ? '1' : '0';
1413 bits[k] = 0;
1414 DBG("%03x %s\n", x & 511, bits);
1415 head++;
1416 continue;
1418 ffff_count++;
1419 queue[tail].tree_code = q.tree_code * 2 + 1;
1420 queue[tail].code_code = q.code_code * 2;
1421 tail++;
1422 queue[tail].tree_code = q.tree_code * 2 + 1 + 1;
1423 queue[tail].code_code = q.code_code * 2 + 1;
1424 tail++;
1425 head++;
1427 DBG("0xffff count: %zu\n", ffff_count);
1431 * Determines the sort order of one prefix_code_symbol relative to another
1433 static int compare_uint16(const uint16_t *a, const uint16_t *b)
1435 if (*a < *b) {
1436 return -1;
1438 if (*a > *b) {
1439 return 1;
1441 return 0;
1445 static bool fill_decomp_table(struct bitstream *input)
1448 * There are 512 symbols, each encoded in 4 bits, which indicates
1449 * their depth in the Huffman tree. The even numbers get the lower
1450 * nibble of each byte, so that the byte hex values look backwards
1451 * (i.e. 0xab encodes b then a). These are allocated Huffman codes in
1452 * order of appearance, per depth.
1454 * For example, if the first two bytes were:
1456 * 0x23 0x53
1458 * the first four codes have the lengths 3, 2, 3, 5.
1459 * Let's call them A, B, C, D.
1461 * Suppose there is no other codeword with length 1 (which is
1462 * necessarily true in this example) or 2, but there might be others
1463 * of length 3 or 4. Then we can say this about the codes:
1465 * _ --*--_
1466 * / \
1467 * 0 1
1468 * / \ / \
1469 * 0 1 0 1
1470 * B |\ / \ |\
1471 * 0 1 0 1 0 1
1472 * A C |\ /| | |\
1474 * pos bits code
1475 * A 3 010
1476 * B 2 00
1477 * C 3 011
1478 * D 5 1????
1480 * B has the shortest code, so takes the leftmost branch, 00. That
1481 * ends the branch -- nothing else can start with 00. There are no
1482 * more 2s, so we look at the 3s, starting as far left as possible. So
1483 * A takes 010 and C takes 011. That means everything else has to
1484 * start with 1xx. We don't know how many codewords of length 3 or 4
1485 * there are; if there are none, D would end up with 10000, the
1486 * leftmost available code of length 5. If the compressor is any good,
1487 * there should be no unused leaf nodes left dangling at the end.
1489 * (this is "Canonical Huffman Coding").
1492 * But what symbols do these codes actually stand for?
1493 * --------------------------------------------------
1495 * Good question. The first 256 codes stand for the corresponding
1496 * literal bytes. The codes from 256 to 511 stand for LZ77 matches,
1497 * which have a distance and a length, encoded in a strange way that
1498 * isn't entirely the purview of this function.
1500 * What does the value 0 mean?
1501 * ---------------------------
1503 * The code does not occur. For example, if the next byte in the
1504 * example above was 0x07, that would give the byte 0x04 a 7-long
1505 * code, and no code to the 0x05 byte, which means we there is no way
1506 * we going to see a 5 in the decoded stream.
1508 * Isn't LZ77 + Huffman what zip/gzip/zlib do?
1509 * -------------------------------------------
1511 * Yes, DEFLATE is LZ77 + Huffman, but the details are quite different.
1513 uint16_t symbols[512];
1514 uint16_t sort_mem[512];
1515 size_t i, n_symbols;
1516 ssize_t code;
1517 uint16_t len = 0, prev_len;
1518 const uint8_t *table_bytes = input->bytes + input->byte_pos;
1520 if (input->byte_pos + 260 > input->byte_size) {
1521 return false;
1524 n_symbols = 0;
1525 for (i = 0; i < 256; i++) {
1526 uint16_t even = table_bytes[i] & 15;
1527 uint16_t odd = table_bytes[i] >> 4;
1528 if (even != 0) {
1529 symbols[n_symbols] = (even << 9) + i * 2;
1530 n_symbols++;
1532 if (odd != 0) {
1533 symbols[n_symbols] = (odd << 9) + i * 2 + 1;
1534 n_symbols++;
1537 input->byte_pos += 256;
1538 if (n_symbols == 0) {
1539 return false;
1542 stable_sort(symbols, sort_mem, n_symbols, sizeof(uint16_t),
1543 (samba_compare_fn_t)compare_uint16);
1546 * we're using an implicit binary tree, as you'd see in a heap.
1547 * table[0] = unused
1548 * table[1] = '0'
1549 * table[2] = '1'
1550 * table[3] = '00' <-- '00' and '01' are children of '0'
1551 * table[4] = '01' <-- '0' is [0], children are [0 * 2 + {1,2}]
1552 * table[5] = '10'
1553 * table[6] = '11'
1554 * table[7] = '000'
1555 * table[8] = '001'
1556 * table[9] = '010'
1557 * table[10]= '011'
1558 * table[11]= '100
1560 * table[1 << n - 1] = '0' * n
1561 * table[1 << n - 1 + x] = n-bit wide x (left padded with '0')
1562 * table[1 << n - 2] = '1' * (n - 1)
1564 * table[i]->left = table[i*2 + 1]
1565 * table[i]->right = table[i*2 + 2]
1566 * table[0xffff] = unused (16 '0's, max len is 15)
1568 * therefore e.g. table[70] = table[64 - 1 + 7]
1569 * = table[1 << 6 - 1 + 7]
1570 * = '000111' (binary 7, widened to 6 bits)
1572 * and if '000111' is a code,
1573 * '00011', '0001', '000', '00', '0' are unavailable prefixes.
1574 * 34 16 7 3 1 are their indices
1575 * and (i - 1) >> 1 is the path back from 70 through these.
1577 * the lookup is
1579 * 1 start with i = 0
1580 * 2 extract a symbol bit (i = (i << 1) + bit + 1)
1581 * 3 is table[i] == 0xffff?
1582 * 4 yes -- goto 2
1583 * 4 table[i] & 511 is the symbol, stop
1585 * and the construction (here) is sort of the reverse.
1587 * Most of this table is free space that can never be reached, and
1588 * most of the activity is at the beginning (since all codes start
1589 * there, and by design the shortest codes are the most common).
1591 for (i = 0; i < 32; i++) {
1592 /* prefill the table head */
1593 input->table[i] = 0xffff;
1595 code = -1;
1596 prev_len = 0;
1597 for (i = 0; i < n_symbols; i++) {
1598 uint16_t s = symbols[i];
1599 uint16_t prefix;
1600 len = (s >> 9) & 15;
1601 s &= 511;
1602 code++;
1603 while (len != prev_len) {
1604 code <<= 1;
1605 code++;
1606 prev_len++;
1609 if (code >= 65535) {
1610 return false;
1612 input->table[code] = s;
1613 for(prefix = (code - 1) >> 1;
1614 prefix > 31;
1615 prefix = (prefix - 1) >> 1) {
1616 input->table[prefix] = 0xffff;
1619 if (CHECK_DEBUGLVL(10)) {
1620 debug_tree_codes(input);
1624 * check that the last code encodes 11111..., with right number of
1625 * ones, pointing to the right symbol -- otherwise we have a dangling
1626 * uninitialised symbol.
1628 if (code != (1 << (len + 1)) - 2) {
1629 return false;
1631 return true;
1635 #define CHECK_READ_32(dest) \
1636 do { \
1637 if (input->byte_pos + 4 > input->byte_size) { \
1638 return LZXPRESS_ERROR; \
1640 dest = PULL_LE_U32(input->bytes, input->byte_pos); \
1641 input->byte_pos += 4; \
1642 } while (0)
1644 #define CHECK_READ_16(dest) \
1645 do { \
1646 if (input->byte_pos + 2 > input->byte_size) { \
1647 return LZXPRESS_ERROR; \
1649 dest = PULL_LE_U16(input->bytes, input->byte_pos); \
1650 input->byte_pos += 2; \
1651 } while (0)
1653 #define CHECK_READ_8(dest) \
1654 do { \
1655 if (input->byte_pos >= input->byte_size) { \
1656 return LZXPRESS_ERROR; \
1658 dest = PULL_LE_U8(input->bytes, input->byte_pos); \
1659 input->byte_pos++; \
1660 } while(0)
1663 static inline ssize_t pull_bits(struct bitstream *input)
1665 if (input->byte_pos + 1 < input->byte_size) {
1666 uint16_t tmp;
1667 CHECK_READ_16(tmp);
1668 input->remaining_bits += 16;
1669 input->bits <<= 16;
1670 input->bits |= tmp;
1671 } else if (input->byte_pos < input->byte_size) {
1672 uint8_t tmp;
1673 CHECK_READ_8(tmp);
1674 input->remaining_bits += 8;
1675 input->bits <<= 8;
1676 input->bits |= tmp;
1677 } else {
1678 return LZXPRESS_ERROR;
1680 return 0;
1685 * Decompress a block. The actual decompressed size is returned (or -1 on
1686 * error). The putative block length is 64k (or shorter, if the message ends
1687 * first), but a match can run over the end, extending the block. That's why
1688 * we need the overall output size as well as the block size. A match encoded
1689 * in this block can point back to previous blocks, but not before the
1690 * beginning of the message, so we also need the previously decoded size.
1692 * The compressed block will have 256 bytes for the Huffman table, and at
1693 * least 4 bytes of (possibly padded) encoded values.
1695 static ssize_t lzx_huffman_decompress_block(struct bitstream *input,
1696 uint8_t *output,
1697 size_t block_size,
1698 size_t output_size,
1699 size_t previous_size)
1701 size_t output_pos = 0;
1702 uint16_t symbol;
1703 size_t index;
1704 uint16_t distance_bits_wanted = 0;
1705 size_t distance = 0;
1706 size_t length = 0;
1707 bool ok;
1708 uint32_t tmp;
1709 bool seen_eof_marker = false;
1711 ok = fill_decomp_table(input);
1712 if (! ok) {
1713 return LZXPRESS_ERROR;
1715 if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) {
1716 debug_huffman_tree_from_table(input->table);
1719 * Always read 32 bits at the start, even if we don't need them.
1721 CHECK_READ_16(tmp);
1722 CHECK_READ_16(input->bits);
1723 input->bits |= tmp << 16;
1724 input->remaining_bits = 32;
1727 * This loop iterates over individual *bits*. These are read from
1728 * little-endian 16 bit words, most significant bit first.
1730 * At points in the bitstream, the following are possible:
1732 * # the source word is empty and needs to be refilled from the input
1733 * stream.
1734 * # an incomplete codeword is being extended.
1735 * # a codeword is resolved, either as a literal or a match.
1736 * # a literal is written.
1737 * # a match is collecting distance bits.
1738 * # the output stream is copied, as specified by a match.
1739 * # input bytes are read for match lengths.
1741 * Note that we *don't* specifically check for the EOF marker (symbol
1742 * 256) in this loop, because the precondition for stopping for the
1743 * EOF marker is that the output buffer is full (otherwise, you
1744 * wouldn't know which 256 is EOF, rather than an actual symbol), and
1745 * we *always* want to stop when the buffer is full. So we work out if
1746 * there is an EOF in another loop after we stop writing.
1749 index = 0;
1750 while (output_pos < block_size) {
1751 uint16_t b;
1752 if (input->remaining_bits == 16) {
1753 ssize_t ret = pull_bits(input);
1754 if (ret) {
1755 return ret;
1758 input->remaining_bits--;
1760 b = (input->bits >> input->remaining_bits) & 1;
1761 if (length == 0) {
1762 /* not in a match; pulling a codeword */
1763 index <<= 1;
1764 index += b + 1;
1765 if (input->table[index] == 0xffff) {
1766 /* incomplete codeword, the common case */
1767 continue;
1769 /* found the symbol, reset the code string */
1770 symbol = input->table[index] & 511;
1771 index = 0;
1772 if (symbol < 256) {
1773 /* a literal, the easy case */
1774 output[output_pos] = symbol;
1775 output_pos++;
1776 continue;
1779 /* the beginning of a match */
1780 distance_bits_wanted = (symbol >> 4) & 15;
1781 distance = 1 << distance_bits_wanted;
1782 length = symbol & 15;
1783 if (length == 15) {
1784 CHECK_READ_8(tmp);
1785 length += tmp;
1786 if (length == 255 + 15) {
1788 * note, we discard (don't add) the
1789 * length so far.
1791 CHECK_READ_16(length);
1792 if (length == 0) {
1793 CHECK_READ_32(length);
1797 length += 3;
1798 } else {
1799 /* we are pulling extra distance bits */
1800 distance_bits_wanted--;
1801 distance |= b << distance_bits_wanted;
1804 if (distance_bits_wanted == 0) {
1806 * We have a complete match, and it is time to do the
1807 * copy (byte by byte, because the ranges can overlap,
1808 * and we might need to copy bytes we just copied in).
1810 * It is possible that this match will extend beyond
1811 * the end of the expected block. That's fine, so long
1812 * as it doesn't extend past the total output size.
1814 size_t i;
1815 size_t end = output_pos + length;
1816 uint8_t *here = output + output_pos;
1817 uint8_t *there = here - distance;
1818 if (end > output_size ||
1819 previous_size + output_pos < distance ||
1820 unlikely(end < output_pos || there > here)) {
1821 return LZXPRESS_ERROR;
1823 for (i = 0; i < length; i++) {
1824 here[i] = there[i];
1826 output_pos += length;
1827 distance = 0;
1828 length = 0;
1832 if (length != 0 || index != 0) {
1833 /* it seems like we've hit an early end, mid-code */
1834 return LZXPRESS_ERROR;
1837 if (input->byte_pos + 256 < input->byte_size) {
1839 * This block is over, but it clearly isn't the last block, so
1840 * we don't want to look for the EOF.
1842 return output_pos;
1845 * We won't write any more, but we try to read some more to make sure
1846 * we're finishing in a good place. That means we want to see a 256
1847 * symbol and then some number of zeroes, possibly zero, but as many
1848 * as 32.
1850 * In this we are perhaps a bit stricter than Windows, which
1851 * apparently does not insist on the EOF marker, nor on a lack of
1852 * trailing bytes.
1854 while (true) {
1855 uint16_t b;
1856 if (input->remaining_bits == 16) {
1857 ssize_t ret;
1858 if (input->byte_pos == input->byte_size) {
1859 /* FIN */
1860 break;
1862 ret = pull_bits(input);
1863 if (ret) {
1864 return ret;
1867 input->remaining_bits--;
1868 b = (input->bits >> input->remaining_bits) & 1;
1869 if (seen_eof_marker) {
1871 * we have read an EOF symbols. Now we just want to
1872 * see zeroes.
1874 if (b != 0) {
1875 return LZXPRESS_ERROR;
1877 continue;
1880 /* we're pulling in a symbol, which had better be 256 */
1881 index <<= 1;
1882 index += b + 1;
1883 if (input->table[index] == 0xffff) {
1884 continue;
1887 symbol = input->table[index] & 511;
1888 if (symbol != 256) {
1889 return LZXPRESS_ERROR;
1891 seen_eof_marker = true;
1892 continue;
1895 if (! seen_eof_marker) {
1896 return LZXPRESS_ERROR;
1899 return output_pos;
1902 static ssize_t lzxpress_huffman_decompress_internal(struct bitstream *input,
1903 uint8_t *output,
1904 size_t output_size)
1906 size_t output_pos = 0;
1908 if (input->byte_size < 260) {
1909 return LZXPRESS_ERROR;
1912 while (input->byte_pos < input->byte_size) {
1913 ssize_t block_output_pos;
1914 ssize_t block_output_size;
1915 size_t remaining_output_size = output_size - output_pos;
1917 block_output_size = MIN(65536, remaining_output_size);
1919 block_output_pos = lzx_huffman_decompress_block(
1920 input,
1921 output + output_pos,
1922 block_output_size,
1923 remaining_output_size,
1924 output_pos);
1926 if (block_output_pos < block_output_size) {
1927 return LZXPRESS_ERROR;
1929 output_pos += block_output_pos;
1930 if (output_pos > output_size) {
1931 /* not expecting to get here. */
1932 return LZXPRESS_ERROR;
1936 if (input->byte_pos != input->byte_size) {
1937 return LZXPRESS_ERROR;
1940 return output_pos;
1945 * lzxpress_huffman_decompress()
1947 * output_size must be the expected length of the decompressed data.
1948 * input_size and output_size are limited to the minimum of UINT32_MAX and
1949 * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
1951 * @param input_bytes memory to be decompressed.
1952 * @param input_size length of the compressed buffer.
1953 * @param output destination for the decompressed data.
1954 * @param output_size exact expected length of the decompressed data.
1956 * @return the number of bytes written or -1 on error.
1959 ssize_t lzxpress_huffman_decompress(const uint8_t *input_bytes,
1960 size_t input_size,
1961 uint8_t *output,
1962 size_t output_size)
1964 uint16_t table[65536];
1965 struct bitstream input = {
1966 .bytes = input_bytes,
1967 .byte_size = input_size,
1968 .byte_pos = 0,
1969 .bits = 0,
1970 .remaining_bits = 0,
1971 .table = table
1974 if (input_size > SSIZE_MAX ||
1975 input_size > UINT32_MAX ||
1976 output_size > SSIZE_MAX ||
1977 output_size > UINT32_MAX ||
1978 input_size == 0 ||
1979 output_size == 0 ||
1980 input_bytes == NULL ||
1981 output == NULL) {
1983 * We use negative ssize_t to return errors, which is limiting
1984 * on 32 bit machines, and the 4GB limit exists on Windows.
1986 return LZXPRESS_ERROR;
1989 return lzxpress_huffman_decompress_internal(&input,
1990 output,
1991 output_size);
1996 * lzxpress_huffman_decompress_talloc()
1998 * The caller must provide the exact size of the expected output.
2000 * The input_size is limited to the minimum of UINT32_MAX and SSIZE_MAX, but
2001 * output_size is limited to 256MB due to a limit in talloc. This effectively
2002 * limits input_size too, as non-crafted compressed data will not exceed the
2003 * decompressed size by very much.
2005 * @param mem_ctx TALLOC_CTX parent for the decompressed buffer.
2006 * @param input_bytes memory to be decompressed.
2007 * @param input_size length of the compressed buffer.
2008 * @param output_size expected decompressed size.
2010 * @return a talloc'ed buffer exactly output_size in length, or NULL.
2013 uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
2014 const uint8_t *input_bytes,
2015 size_t input_size,
2016 size_t output_size)
2018 ssize_t result;
2019 uint8_t *output = NULL;
2020 struct bitstream input = {
2021 .bytes = input_bytes,
2022 .byte_size = input_size
2025 output = talloc_array(mem_ctx, uint8_t, output_size);
2026 if (output == NULL) {
2027 return NULL;
2030 input.table = talloc_array(mem_ctx, uint16_t, 65536);
2031 if (input.table == NULL) {
2032 talloc_free(output);
2033 return NULL;
2035 result = lzxpress_huffman_decompress_internal(&input,
2036 output,
2037 output_size);
2038 talloc_free(input.table);
2040 if (result != output_size) {
2041 talloc_free(output);
2042 return NULL;
2044 return output;