lib/compression/lzxpress_huffman.c

   1 /*
   2  * Samba compression library - LGPLv3
   3  *
   4  * Copyright © Catalyst IT 2022
   5  *
   6  * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
   7  *        and Jo Sutton       <josutton@catalyst.net.nz>
   8  *
   9  *  ** NOTE! The following LGPL license applies to this file.
  10  *  ** It does NOT imply that all of Samba is released under the LGPL
  11  *
  12  *  This library is free software; you can redistribute it and/or
  13  *  modify it under the terms of the GNU Lesser General Public
  14  *  License as published by the Free Software Foundation; either
  15  *  version 3 of the License, or (at your option) any later version.
  16  *
  17  *  This library is distributed in the hope that it will be useful,
  18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  *  Lesser General Public License for more details.
  21  *
  22  *  You should have received a copy of the GNU Lesser General Public
  23  *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
  24  */
  25
  26 #include <talloc.h>
  27
  28 #include "replace.h"
  29 #include "lzxpress_huffman.h"
  30 #include "lib/util/stable_sort.h"
  31 #include "lib/util/debug.h"
  32 #include "lib/util/byteorder.h"
  33 #include "lib/util/bytearray.h"
  34
  35 /*
 * DEBUG_NO_LZ77_MATCHES turns off the encoding of matches as matches. If it
 * is true, every potential match is written as a series of literals, which is
 * a valid but usually inefficient encoding. This is useful for isolating a
 * problem to either the LZ77 or the Huffman stage.
  40  */
  41 #ifndef DEBUG_NO_LZ77_MATCHES
  42 #define DEBUG_NO_LZ77_MATCHES false
  43 #endif
  44
  45 /*
  46  * DEBUG_HUFFMAN_TREE forces the drawing of ascii art huffman trees during
  47  * compression and decompression.
  48  *
  49  * These trees will also be drawn at DEBUG level 10, but that doesn't work
  50  * with cmocka tests.
  51  */
  52 #ifndef DEBUG_HUFFMAN_TREE
  53 #define DEBUG_HUFFMAN_TREE false
  54 #endif
  55
  56 #if DEBUG_HUFFMAN_TREE
  57 #define DBG(...) fprintf(stderr, __VA_ARGS__)
  58 #else
  59 #define DBG(...) DBG_INFO(__VA_ARGS__)
  60 #endif
  61
  62
  63 #define LZXPRESS_ERROR -1LL
  64
  65 /*
  66  * We won't encode a match length longer than MAX_MATCH_LENGTH.
  67  *
  68  * Reports are that Windows has a limit at 64M.
  69  */
  70 #define MAX_MATCH_LENGTH (64 * 1024 * 1024)
  71
  72
/*
 * struct bitstream appears to be the state of a reader that takes a byte
 * buffer apart a few bits at a time.
 *
 * NOTE(review): nothing in the part of the file reviewed here touches this
 * struct, so the per-field notes below are inferred from names and types
 * only. Confirm them against the code that uses it.
 */
struct bitstream {
        /* presumably the buffer being read (never written through) */
        const uint8_t *bytes;
        /* presumably the index in bytes of the next unread byte */
        size_t byte_pos;
        /* presumably the total number of bytes available in bytes */
        size_t byte_size;
        /* presumably bits already fetched from bytes but not yet consumed */
        uint32_t bits;
        /* presumably how many of the bits in 'bits' are still unconsumed */
        int remaining_bits;
        /* presumably a Huffman decoding table -- TODO confirm */
        uint16_t *table;
};
  81
  82
  83 #if ! defined __has_builtin
  84 #define __has_builtin(x) 0
  85 #endif
  86
  87 /*
  88  * bitlen_nonzero_16() returns the bit number of the most significant bit, or
  89  * put another way, the integer log base 2. Log(0) is undefined; the argument
  90  * has to be non-zero!
  91  * 1     -> 0
  92  * 2,3   -> 1
  93  * 4-7   -> 2
  94  * 1024  -> 10, etc
  95  *
  96  * Probably this is handled by a compiler intrinsic function that maps to a
  97  * dedicated machine instruction.
  98  */
  99
 100 static inline int bitlen_nonzero_16(uint16_t x)
 101 {
 102 #if  __has_builtin(__builtin_clz)
 103
 104         /* __builtin_clz returns the number of leading zeros */
 105         return (sizeof(unsigned int) * CHAR_BIT) - 1
 106                 - __builtin_clz((unsigned int) x);
 107
 108 #else
 109
 110         int count = -1;
 111         while(x) {
 112                 x >>= 1;
 113                 count++;
 114         }
 115         return count;
 116
 117 #endif
 118 }
 119
 120
/*
 * struct lzxhuff_compressor_context keeps track of where a compression run
 * has got to in its input and in its output.
 */
struct lzxhuff_compressor_context {
        /* start of the data being compressed */
        const uint8_t *input_bytes;
        /* total number of bytes at input_bytes */
        size_t input_size;
        /* offset in input_bytes at which the block being encoded starts */
        size_t input_pos;
        /*
         * offset in input_bytes of the previous block; equal to input_pos
         * when there is no previous block to refer back to
         */
        size_t prev_block_pos;
        /* presumably the buffer the compressed data goes to -- TODO confirm */
        uint8_t *output;
        /* presumably the capacity of output -- TODO confirm */
        size_t available_size;
        /* presumably the offset in output of the next write -- TODO confirm */
        size_t output_pos;
};
 130
 131 static int compare_huffman_node_count(struct huffman_node *a,
 132                                       struct huffman_node *b)
 133 {
 134         return a->count - b->count;
 135 }
 136
 137 static int compare_huffman_node_depth(struct huffman_node *a,
 138                                       struct huffman_node *b)
 139 {
 140         int c = a->depth - b->depth;
 141         if (c != 0) {
 142                 return c;
 143         }
 144         return (int)a->symbol - (int)b->symbol;
 145 }
 146
 147
 148 #define HASH_MASK ((1 << LZX_HUFF_COMP_HASH_BITS) - 1)
 149
 150 static inline uint16_t three_byte_hash(const uint8_t *bytes)
 151 {
 152         /*
 153          * MS-XCA says "three byte hash", but does not specify it.
 154          *
 155          * This one is just cobbled together, but has quite good distribution
 156          * in the 12-14 bit forms, which is what we care about most.
 157          * e.g: 13 bit: median 2048, min 2022, max 2074, stddev 6.0
 158          */
 159         uint16_t a = bytes[0];
 160         uint16_t b = bytes[1] ^ 0x2e;
 161         uint16_t c = bytes[2] ^ 0x55;
 162         uint16_t ca = c - a;
 163         uint16_t d = ((a + b) << 8) ^ (ca << 5) ^ (c + b) ^ (0xcab + a);
 164         return d & HASH_MASK;
 165 }
 166
 167
 168 static inline uint16_t encode_match(size_t len, size_t offset)
 169 {
 170         uint16_t code = 256;
 171         code |= MIN(len - 3, 15);
 172         code |= bitlen_nonzero_16(offset) << 4;
 173         return code;
 174 }
 175
 176 /*
 177  * debug_huffman_tree() uses debug_huffman_tree_print() to draw the Huffman
 178  * tree in ascii art.
 179  *
 180  * Note that the Huffman tree is probably not the same as that implied by the
 181  * canonical Huffman encoding that is finally used. That tree would be the
 182  * same shape, but with the left and right toggled to sort the branches by
 * length, after which the symbols for each length are sorted by value.
 184  */
 185
 186 static void debug_huffman_tree_print(struct huffman_node *node,
 187                                      int *trail, int depth)
 188 {
 189         if (node->left == NULL) {
 190                 /* time to print a row */
 191                 int j;
 192                 bool branched = false;
 193                 int row[17];
 194                 char c[100];
 195                 int s = node->symbol;
 196                 char code[17];
 197                 if (depth > 15) {
 198                         fprintf(stderr,
 199                                 " \033[1;31m Max depth exceeded! (%d)\033[0m "
 200                                 " symbol %#3x claimed depth %d count %d\n",
 201                                 depth, node->symbol, node->depth, node->count);
 202                         return;
 203                 }
 204                 for (j = depth - 1; j >= 0; j--) {
 205                         if (branched) {
 206                                 if (trail[j] == -1) {
 207                                         row[j] = -3;
 208                                 } else {
 209                                         row[j] = -2;
 210                                 }
 211                         } else if (trail[j] == -1) {
 212                                 row[j] = -1;
 213                                 branched = true;
 214                         } else {
 215                                 row[j] = trail[j];
 216                         }
 217                 }
 218                 for (j = 0; j < depth; j++) {
 219                         switch (row[j]) {
 220                         case -3:
 221                                 code[j] = '1';
 222                                 fprintf(stderr, "        ");
 223                                 break;
 224                         case -2:
 225                                 code[j] = '0';
 226                                 fprintf(stderr, "      │ ");
 227                                 break;
 228                         case -1:
 229                                 code[j] = '1';
 230                                 fprintf(stderr, "      ╰─");
 231                                 break;
 232                         default:
 233                                 code[j] = '0';
 234                                 fprintf(stderr, "%5d─┬─", row[j]);
 235                                 break;
 236                         }
 237                 }
 238                 code[depth] = 0;
 239                 if (s < 32) {
 240                         snprintf(c, sizeof(c),
 241                                 "\033[1;32m%02x\033[0m \033[1;33m%c%c%c\033[0m",
 242                                  s,
 243                                  0xE2, 0x90, 0x80 + s); /* utf-8 for symbol */
 244                 }  else if (s < 127) {
 245                         snprintf(c, sizeof(c),
 246                                  "\033[1;32m%2x\033[0m '\033[10;32m%c\033[0m'",
 247                                  s, s);
 248                 } else if (s < 256) {
 249                         snprintf(c, sizeof(c), "\033[1;32m%2x\033[0m", s);
 250                 } else {
 251                         uint16_t len = (s & 15) + 3;
 252                         uint16_t dbits = ((s >> 4) & 15) + 1;
 253                         snprintf(c, sizeof(c),
 254                                  " \033[0;33mlen:%2d%s, "
 255                                  "dist:%d-%d \033[0m \033[1;32m%3x\033[0m%s",
 256                                  len,
 257                                  len == 18 ? "+" : "",
 258                                  1 << (dbits - 1),
 259                                  (1 << dbits) - 1,
 260                                  s,
 261                                  s == 256 ? " \033[1;31mEOF\033[0m" : "");
 262
 263                 }
 264
 265                 fprintf(stderr, "──%5d %s \033[2;37m%s\033[0m\n",
 266                         node->count, c, code);
 267                 return;
 268         }
 269         trail[depth] = node->count;
 270         debug_huffman_tree_print(node->left, trail, depth + 1);
 271         trail[depth] = -1;
 272         debug_huffman_tree_print(node->right, trail, depth + 1);
 273 }
 274
 275
 276 /*
 277  * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree()
 278  * will print a tree looking something like this:
 279  *
 280  *     7─┬───    3  len:18+, dist:1-1  10f 0
 281  *       ╰─    4─┬─    2─┬───    1 61 'a' 100
 282  *               │       ╰───    1 62 'b' 101
 283  *               ╰─    2─┬───    1 63 'c' 110
 284  *                       ╰───    1  len: 3, dist:1-1  100 EOF 111
 285  *
 286  * This is based off a Huffman root node, and the tree may not be the same as
 287  * the canonical tree.
 288  */
 289 static void debug_huffman_tree(struct huffman_node *root)
 290 {
 291         int trail[17];
 292         debug_huffman_tree_print(root, trail, 0);
 293 }
 294
 295
 296 /*
 297  * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree_from_table()
 298  * will print something like this based on a decoding symbol table.
 299  *
 300  *  Tree from decoding table 9 nodes → 5 codes
 301  * 10000─┬─── 5000  len:18+, dist:1-1  10f 0
 302  *       ╰─ 5000─┬─ 2500─┬─── 1250 61 'a' 100
 303  *               │       ╰─── 1250 62 'b' 101
 304  *               ╰─ 2500─┬─── 1250 63 'c' 110
 305  *                       ╰─── 1250  len: 3, dist:1-1  100 EOF 111
 306  *
 307  * This is the canonical form of the Huffman tree where the actual counts
 308  * aren't known (we use "10000" to help indicate relative frequencies).
 309  */
 310 static void debug_huffman_tree_from_table(uint16_t *table)
 311 {
 312         int trail[17];
 313         struct huffman_node nodes[1024] = {{0}};
 314         uint16_t codes[1024];
 315         size_t n = 1;
 316         size_t i = 0;
 317         codes[0] = 0;
 318         nodes[0].count = 10000;
 319
 320         while (i < n) {
 321                 uint16_t index = codes[i];
 322                 struct huffman_node *node = &nodes[i];
 323                 if (table[index] == 0xffff) {
 324                         /* internal node */
 325                         index <<= 1;
 326                         /* left */
 327                         index++;
 328                         codes[n] = index;
 329                         node->left = nodes + n;
 330                         nodes[n].count = node->count >> 1;
 331                         n++;
 332                         /*right*/
 333                         index++;
 334                         codes[n] = index;
 335                         node->right = nodes + n;
 336                         nodes[n].count = node->count >> 1;
 337                         n++;
 338                 } else {
 339                         /* leaf node */
 340                         node->symbol = table[index] & 511;
 341                 }
 342                 i++;
 343         }
 344
 345         fprintf(stderr,
 346                 "\033[1;34m Tree from decoding table\033[0m "
 347                 "%zu nodes → %zu codes\n",
 348                 n, (n + 1) / 2);
 349         debug_huffman_tree_print(nodes, trail, 0);
 350 }
 351
 352
 353 static bool depth_walk(struct huffman_node *n, uint32_t depth)
 354 {
 355         bool ok;
 356         if (n->left == NULL) {
 357                 /* this is a leaf, record the depth */
 358                 n->depth = depth;
 359                 return true;
 360         }
 361         if (depth > 14) {
 362                 return false;
 363         }
 364         ok = (depth_walk(n->left, depth + 1) &&
 365               depth_walk(n->right, depth + 1));
 366
 367         return ok;
 368 }
 369
 370
 371 static bool check_and_record_depths(struct huffman_node *root)
 372 {
 373         return depth_walk(root, 0);
 374 }
 375
 376
 377 static bool encode_values(struct huffman_node *leaves,
 378                           size_t n_leaves,
 379                           uint16_t symbol_values[512])
 380 {
 381         size_t i;
 382         /*
 383          * See, we have a leading 1 in our internal code representation, which
 384          * indicates the code length.
 385          */
 386         uint32_t code = 1;
 387         uint32_t code_len = 0;
 388         memset(symbol_values, 0, sizeof(uint16_t) * 512);
 389         for (i = 0; i < n_leaves; i++) {
 390                 code <<= leaves[i].depth - code_len;
 391                 code_len = leaves[i].depth;
 392
 393                 symbol_values[leaves[i].symbol] = code;
 394                 code++;
 395         }
 396         /*
 397          * The last code should be 11111... with code_len + 1 ones. The final
 398          * code++ will wrap this round to 1000... with code_len + 1 zeroes.
 399          */
 400
 401         if (code != 2 << code_len) {
 402                 return false;
 403         }
 404         return true;
 405 }
 406
 407
 408 static int generate_huffman_codes(struct huffman_node *leaf_nodes,
 409                                   struct huffman_node *internal_nodes,
 410                                   uint16_t symbol_values[512])
 411 {
 412         size_t head_leaf = 0;
 413         size_t head_branch = 0;
 414         size_t tail_branch = 0;
 415         struct huffman_node *huffman_root = NULL;
 416         size_t i, j;
 417         size_t n_leaves = 0;
 418
 419         /*
 420          * Before we sort the nodes, we can eliminate the unused ones.
 421          */
 422         for (i = 0; i < 512; i++) {
 423                 if (leaf_nodes[i].count) {
 424                         leaf_nodes[n_leaves] = leaf_nodes[i];
 425                         n_leaves++;
 426                 }
 427         }
 428         if (n_leaves == 0) {
 429                 return LZXPRESS_ERROR;
 430         }
 431         if (n_leaves == 1) {
 432                 /*
 433                  * There is *almost* no way this should happen, and it would
 434                  * ruin the tree (because the shortest possible codes are 1
 435                  * bit long, and there are two of them).
 436                  *
 437                  * The only way to get here is in an internal block in a
 438                  * 3-or-more block message (i.e. > 128k), which consists
 439                  * entirely of a match starting in the previous block (if it
 440                  * was the end block, it would have the EOF symbol).
 441                  *
 442                  * What we do is add a dummy symbol which is this one XOR 256.
 443                  * It won't be used in the stream but will balance the tree.
 444                  */
 445                 leaf_nodes[1] = leaf_nodes[0];
 446                 leaf_nodes[1].symbol ^= 0x100;
 447                 n_leaves = 2;
 448         }
 449
 450         /* note, in sort we're using internal_nodes as auxiliary space */
 451         stable_sort(leaf_nodes,
 452                     internal_nodes,
 453                     n_leaves,
 454                     sizeof(struct huffman_node),
 455                     (samba_compare_fn_t)compare_huffman_node_count);
 456
 457         /*
 458          * This outer loop is for re-quantizing the counts if the tree is too
 459          * tall (>15), which we need to do because the final encoding can't
 460          * express a tree that deep.
 461          *
 462          * In theory, this should be a 'while (true)' loop, but we chicken
 463          * out with 10 iterations, just in case.
 464          *
 465          * In practice it will almost always resolve in the first round; if
 466          * not then, in the second or third. Remember we'll looking at 64k or
 467          * less, so the rarest we can have is 1 in 64k; each round of
 468          * quantization effectively doubles its frequency to 1 in 32k, 1 in
 469          * 16k, etc, until we're treating the rare symbol as actually quite
 470          * common.
 471          */
 472         for (j = 0; j < 10; j++) {
 473                 bool less_than_15_bits;
 474                 while (true) {
 475                         struct huffman_node *a = NULL;
 476                         struct huffman_node *b = NULL;
 477                         size_t leaf_len = n_leaves - head_leaf;
 478                         size_t internal_len = tail_branch - head_branch;
 479
 480                         if (leaf_len + internal_len == 1) {
 481                                 /*
 482                                  * We have the complete tree. The root will be
 483                                  * an internal node unless there is just one
 484                                  * symbol, which is already impossible.
 485                                  */
 486                                 if (unlikely(leaf_len == 1)) {
 487                                         return LZXPRESS_ERROR;
 488                                 } else {
 489                                         huffman_root = \
 490                                                 &internal_nodes[head_branch];
 491                                 }
 492                                 break;
 493                         }
 494                         /*
 495                          * We know here we have at least two nodes, and we
 496                          * want to select the two lowest scoring ones. Those
 497                          * have to be either a) the head of each queue, or b)
 498                          * the first two nodes of either queue.
 499                          *
 500                          * The complicating factors are: a) we need to check
 501                          * the length of each queue, and b) in the case of
 502                          * ties, we prefer to pair leaves with leaves.
 503                          *
 504                          * Note a complication we don't have: the leaf node
 505                          * queue never grows, and the subtree queue starts
 506                          * empty and cannot grow beyond n - 1. It feeds on
 507                          * itself. We don't need to think about overflow.
 508                          */
 509                         if (leaf_len == 0) {
 510                                 /* two from subtrees */
 511                                 a = &internal_nodes[head_branch];
 512                                 b = &internal_nodes[head_branch + 1];
 513                                 head_branch += 2;
 514                         } else if (internal_len == 0) {
 515                                 /* two from nodes */
 516                                 a = &leaf_nodes[head_leaf];
 517                                 b = &leaf_nodes[head_leaf + 1];
 518                                 head_leaf += 2;
 519                         } else if (leaf_len == 1 && internal_len == 1) {
 520                                 /* one of each */
 521                                 a = &leaf_nodes[head_leaf];
 522                                 b = &internal_nodes[head_branch];
 523                                 head_branch++;
 524                                 head_leaf++;
 525                         } else {
 526                                 /*
 527                                  * Take the lowest head, twice, checking for
 528                                  * length after taking the first one.
 529                                  */
 530                                 if (leaf_nodes[head_leaf].count >
 531                                     internal_nodes[head_branch].count) {
 532                                         a = &internal_nodes[head_branch];
 533                                         head_branch++;
 534                                         if (internal_len == 1) {
 535                                                 b = &leaf_nodes[head_leaf];
 536                                                 head_leaf++;
 537                                                 goto done;
 538                                         }
 539                                 } else {
 540                                         a = &leaf_nodes[head_leaf];
 541                                         head_leaf++;
 542                                         if (leaf_len == 1) {
 543                                                 b = &internal_nodes[head_branch];
 544                                                 head_branch++;
 545                                                 goto done;
 546                                         }
 547                                 }
 548                                 /* the other node */
 549                                 if (leaf_nodes[head_leaf].count >
 550                                     internal_nodes[head_branch].count) {
 551                                         b = &internal_nodes[head_branch];
 552                                         head_branch++;
 553                                 } else {
 554                                         b = &leaf_nodes[head_leaf];
 555                                         head_leaf++;
 556                                 }
 557                         }
 558                 done:
 559                         /*
 560                          * Now we add a new node to the subtrees list that
 561                          * combines the score of node_a and node_b, and points
 562                          * to them as children.
 563                          */
 564                         internal_nodes[tail_branch].count = a->count + b->count;
 565                         internal_nodes[tail_branch].left = a;
 566                         internal_nodes[tail_branch].right = b;
 567                         tail_branch++;
 568                         if (tail_branch == n_leaves) {
 569                                 /*
 570                                  * We're not getting here, no way, never ever.
 571                                  * Unless we made a terrible mistake.
 572                                  *
 573                                  * That is, in a binary tree with n leaves,
 574                                  * there are ALWAYS n-1 internal nodes.
 575                                  */
 576                                 return LZXPRESS_ERROR;
 577                         }
 578                 }
 579                 if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) {
 580                         debug_huffman_tree(huffman_root);
 581                 }
 582                 /*
 583                  * We have a tree, and need to turn it into a lookup table,
 584                  * and see if it is shallow enough (<= 15).
 585                  */
 586                 less_than_15_bits = check_and_record_depths(huffman_root);
 587                 if (less_than_15_bits) {
 588                         /*
 589                          * Now the leaf nodes know how deep they are, and we
 590                          * no longer need the internal nodes.
 591                          *
 592                          * We need to sort the nodes of equal depth, so that
 593                          * they are sorted by depth first, and symbol value
 594                          * second. The internal_nodes can again be auxiliary
 595                          * memory.
 596                          */
 597                         stable_sort(
 598                                 leaf_nodes,
 599                                 internal_nodes,
 600                                 n_leaves,
 601                                 sizeof(struct huffman_node),
 602                                 (samba_compare_fn_t)compare_huffman_node_depth);
 603
 604                         encode_values(leaf_nodes, n_leaves, symbol_values);
 605
 606                         return n_leaves;
 607                 }
 608
 609                 /*
 610                  * requantize by halving and rounding up, so that small counts
 611                  * become relatively bigger. This will lead to a flatter tree.
 612                  */
 613                 for (i = 0; i < n_leaves; i++) {
 614                         leaf_nodes[i].count >>= 1;
 615                         leaf_nodes[i].count += 1;
 616                 }
 617                 head_leaf = 0;
 618                 head_branch = 0;
 619                 tail_branch = 0;
 620         }
 621         return LZXPRESS_ERROR;
 622 }
 623
 624 /*
 625  * LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS is how far ahead to search in the
 626  * circular hash table for a match, before we give up. A bigger number will
 627  * generally lead to better but slower compression, but a stupidly big number
 628  * will just be worse.
 629  *
 630  * If you're fiddling with this, consider also fiddling with
 631  * LZX_HUFF_COMP_HASH_BITS.
 632  */
 633 #define LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS 5
 634
 635 static inline void store_match(uint16_t *hash_table,
 636                                uint16_t h,
 637                                uint16_t offset)
 638 {
 639         int i;
 640         uint16_t o = hash_table[h];
 641         uint16_t h2;
 642         uint16_t worst_h;
 643         int worst_score;
 644
 645         if (o == 0xffff) {
 646                 /* there is nothing there yet */
 647                 hash_table[h] = offset;
 648                 return;
 649         }
 650         for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
 651                 h2 = (h + i) & HASH_MASK;
 652                 if (hash_table[h2] == 0xffff) {
 653                         hash_table[h2] = offset;
 654                         return;
 655                 }
 656         }
 657         /*
 658          * There are no slots, but we really want to store this, so we'll kick
 659          * out the one with the longest distance.
 660          */
 661         worst_h = h;
 662         worst_score = offset - o;
 663         for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
 664                 int score;
 665                 h2 = (h + i) & HASH_MASK;
 666                 o = hash_table[h2];
 667                 score = offset - o;
 668                 if (score > worst_score) {
 669                         worst_score = score;
 670                         worst_h = h2;
 671                 }
 672         }
 673         hash_table[worst_h] = offset;
 674 }
 675
 676
 677 /*
 678  * Yes, struct match looks a lot like a DATA_BLOB.
 679  */
struct match {
        /*
         * where the earlier occurrence of the data starts, or NULL if no
         * match was found
         */
        const uint8_t *there;
        /* how many bytes match from there; zero if no match was found */
        size_t length;
};
 684
 685
 686 static inline struct match lookup_match(uint16_t *hash_table,
 687                                         uint16_t h,
 688                                         const uint8_t *data,
 689                                         const uint8_t *here,
 690                                         size_t max_len)
 691 {
 692         int i;
 693         uint16_t o = hash_table[h];
 694         uint16_t h2;
 695         size_t len;
 696         const uint8_t *there = NULL;
 697         struct match best = {0};
 698
 699         for (i = 0; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
 700                 h2 = (h + i) & HASH_MASK;
 701                 o = hash_table[h2];
 702                 if (o == 0xffff) {
 703                         /*
 704                          * in setting this, we would never have stepped over
 705                          * an 0xffff, so we won't now.
 706                          */
 707                         break;
 708                 }
 709                 there = data + o;
 710                 if (here - there > 65534 || there > here) {
 711                         continue;
 712                 }
 713
 714                 /*
 715                  * When we already have a long match, we can try to avoid
 716                  * measuring out another long, but shorter match.
 717                  */
 718                 if (best.length > 1000 &&
 719                     there[best.length - 1] != best.there[best.length - 1]) {
 720                         continue;
 721                 }
 722
 723                 for (len = 0;
 724                      len < max_len && here[len] == there[len];
 725                      len++) {
 726                         /* counting */
 727                 }
 728                 if (len > 2) {
 729                         /*
 730                          * As a tiebreaker, we prefer the closer match which
 731                          * is likely to encode smaller (and certainly no worse).
 732                          */
 733                         if (len > best.length ||
 734                             (len == best.length && there > best.there)) {
 735                                 best.length = len;
 736                                 best.there = there;
 737                         }
 738                 }
 739         }
 740         return best;
 741 }
 742
 743
 744
 745 static ssize_t lz77_encode_block(struct lzxhuff_compressor_context *cmp_ctx,
 746                                  struct lzxhuff_compressor_mem *cmp_mem,
 747                                  uint16_t *hash_table,
 748                                  uint16_t *prev_hash_table)
 749 {
 750         uint16_t *intermediate = cmp_mem->intermediate;
 751         struct huffman_node *leaf_nodes = cmp_mem->leaf_nodes;
 752         uint16_t *symbol_values = cmp_mem->symbol_values;
 753         size_t i, j, intermediate_len;
 754         const uint8_t *data = cmp_ctx->input_bytes + cmp_ctx->input_pos;
 755         const uint8_t *prev_block = NULL;
 756         size_t remaining_size = cmp_ctx->input_size - cmp_ctx->input_pos;
 757         size_t block_end = MIN(65536, remaining_size);
 758         struct match match;
 759         int n_symbols;
 760
 761         if (cmp_ctx->input_size < cmp_ctx->input_pos) {
 762                 return LZXPRESS_ERROR;
 763         }
 764
 765         if (cmp_ctx->prev_block_pos != cmp_ctx->input_pos) {
 766                 prev_block = cmp_ctx->input_bytes + cmp_ctx->prev_block_pos;
 767         } else if (prev_hash_table != NULL) {
 768                 /* we've got confused! hash and block should go together */
 769                 return LZXPRESS_ERROR;
 770         }
 771
 772         /*
 773          * leaf_nodes is used to count the symbols seen, for later Huffman
 774          * encoding.
 775          */
 776         for (i = 0; i < 512; i++) {
 777                 leaf_nodes[i] = (struct huffman_node) {
 778                         .symbol = i
 779                 };
 780         }
 781
 782         j = 0;
 783
 784         if (remaining_size < 41 || DEBUG_NO_LZ77_MATCHES) {
 785                 /*
 786                  * There is no point doing a hash table and looking for
 787                  * matches in this tiny block (remembering we are committed to
 788                  * using 32 bits, so there's a good chance we wouldn't even
 789                  * save a byte). The threshold of 41 matches Windows.
 790                  * If remaining_size < 3, we *can't* do the hash.
 791                  */
 792                 i = 0;
 793         } else {
 794                 /*
 795                  * We use 0xffff as the unset value for table, because it is
 796                  * not a valid match offset (and 0x0 is).
 797                  */
 798                 memset(hash_table, 0xff, sizeof(cmp_mem->hash_table1));
 799
 800                 for (i = 0; i <= block_end - 3; i++) {
 801                         uint16_t code;
 802                         const uint8_t *here = data + i;
 803                         uint16_t h = three_byte_hash(here);
 804                         size_t max_len = MIN(remaining_size - i, MAX_MATCH_LENGTH);
 805                         match = lookup_match(hash_table,
 806                                              h,
 807                                              data,
 808                                              here,
 809                                              max_len);
 810
 811                         if (match.there == NULL && prev_hash_table != NULL) {
 812                                 /*
 813                                  * If this is not the first block,
 814                                  * backreferences can look into the previous
 815                                  * block (but only as far as 65535 bytes, so
 816                                  * the end of this block cannot see the start
 817                                  * of the last one).
 818                                  */
 819                                 match = lookup_match(prev_hash_table,
 820                                                      h,
 821                                                      prev_block,
 822                                                      here,
 823                                                      remaining_size - i);
 824                         }
 825
 826                         store_match(hash_table, h, i);
 827
 828                         if (match.there == NULL) {
 829                                 /* add a literal and move on. */
 830                                 uint8_t c = data[i];
 831                                 leaf_nodes[c].count++;
 832                                 intermediate[j] = c;
 833                                 j++;
 834                                 continue;
 835                         }
 836
 837                         /* a real match */
 838                         if (match.length <= 65538) {
 839                                 intermediate[j] = 0xffff;
 840                                 intermediate[j + 1] = match.length - 3;
 841                                 intermediate[j + 2] = here - match.there;
 842                                 j += 3;
 843                         } else {
 844                                 size_t m = match.length - 3;
 845                                 intermediate[j] = 0xfffe;
 846                                 intermediate[j + 1] = m & 0xffff;
 847                                 intermediate[j + 2] = m >> 16;
 848                                 intermediate[j + 3] = here - match.there;
 849                                 j += 4;
 850                         }
 851                         code = encode_match(match.length, here - match.there);
 852                         leaf_nodes[code].count++;
 853                         i += match.length - 1; /* `- 1` for the loop i++ */
 854                         /*
 855                          * A match can take us past the intended block length,
 856                          * extending the block. We don't need to do anything
 857                          * special for this case -- the loops will naturally
 858                          * do the right thing.
 859                          */
 860                 }
 861         }
 862
 863         /*
 864          * There might be some bytes at the end.
 865          */
 866         for (; i < block_end; i++) {
 867                 leaf_nodes[data[i]].count++;
 868                 intermediate[j] = data[i];
 869                 j++;
 870         }
 871
 872         if (i == remaining_size) {
 873                 /* add a trailing EOF marker (256) */
 874                 intermediate[j] = 0xffff;
 875                 intermediate[j + 1] = 0;
 876                 intermediate[j + 2] = 1;
 877                 j += 3;
 878                 leaf_nodes[256].count++;
 879         }
 880
 881         intermediate_len = j;
 882
 883         cmp_ctx->prev_block_pos = cmp_ctx->input_pos;
 884         cmp_ctx->input_pos += i;
 885
 886         /* fill in the symbols table */
 887         n_symbols = generate_huffman_codes(leaf_nodes,
 888                                            cmp_mem->internal_nodes,
 889                                            symbol_values);
 890         if (n_symbols < 0) {
 891                 return n_symbols;
 892         }
 893
 894         return intermediate_len;
 895 }
 896
 897
 898
 899 static ssize_t write_huffman_table(uint16_t symbol_values[512],
 900                                    uint8_t *output,
 901                                    size_t available_size)
 902 {
 903         size_t i;
 904
 905         if (available_size < 256) {
 906                 return LZXPRESS_ERROR;
 907         }
 908
 909         for (i = 0; i < 256; i++) {
 910                 uint8_t b = 0;
 911                 uint16_t even = symbol_values[i * 2];
 912                 uint16_t odd = symbol_values[i * 2 + 1];
 913                 if (even != 0) {
 914                         b = bitlen_nonzero_16(even);
 915                 }
 916                 if (odd != 0) {
 917                         b |= bitlen_nonzero_16(odd) << 4;
 918                 }
 919                 output[i] = b;
 920         }
 921         return i;
 922 }
 923
 924
/*
 * Output state for writing one block's compressed stream.
 *
 * The 16-bit code words and the extra length bytes are interleaved in the
 * output, so three separate offsets into dest are tracked.
 */
struct write_context {
        uint8_t *dest;               /* output buffer */
        size_t dest_len;             /* size of dest; writes are checked against it */
        size_t head;                 /* where lengths go */
        size_t next_code;            /* where symbol stream goes */
        size_t pending_next_code;    /* will be next_code */
        unsigned bit_len;            /* number of bits currently held in bits */
        uint32_t bits;               /* accumulator for bits not yet written */
};
 934
 935 /*
 936  * Write out 16 bits, little-endian, for write_huffman_codes()
 937  *
 938  * As you'll notice, there's a bit to do.
 939  *
 940  * We are collecting up bits in a uint32_t, then when there are 16 of them we
 941  * write out a word into the stream, using a trio of offsets (wc->next_code,
 942  * wc->pending_next_code, and wc->head) which dance around ensuring that the
 943  * bitstream and the interspersed lengths are in the right places relative to
 944  * each other.
 945  */
 946
 947 static inline bool write_bits(struct write_context *wc,
 948                               uint16_t code, uint16_t length)
 949 {
 950         wc->bits <<= length;
 951         wc->bits |= code;
 952         wc->bit_len += length;
 953         if (wc->bit_len > 16) {
 954                 uint32_t w = wc->bits >> (wc->bit_len - 16);
 955                 wc->bit_len -= 16;
 956                 if (wc->next_code + 2 > wc->dest_len ||
 957                     unlikely(wc->bit_len > 16)) {
 958                         return false;
 959                 }
 960                 wc->dest[wc->next_code] = w & 0xff;
 961                 wc->dest[wc->next_code + 1] = (w >> 8) & 0xff;
 962                 wc->next_code = wc->pending_next_code;
 963                 wc->pending_next_code = wc->head;
 964                 wc->head += 2;
 965         }
 966         return true;
 967 }
 968
 969
 970 static inline bool write_code(struct write_context *wc, uint16_t code)
 971 {
 972         int code_bit_len = bitlen_nonzero_16(code);
 973         if (unlikely(code == 0)) {
 974                 return false;
 975         }
 976         code &= (1 << code_bit_len) - 1;
 977         return  write_bits(wc, code, code_bit_len);
 978 }
 979
 980 static inline bool write_byte(struct write_context *wc, uint8_t byte)
 981 {
 982         if (wc->head + 1 > wc->dest_len) {
 983                 return false;
 984         }
 985         wc->dest[wc->head] = byte;
 986         wc->head++;
 987         return true;
 988 }
 989
 990
 991 static inline bool write_long_len(struct write_context *wc, size_t len)
 992 {
 993         if (len < 65535) {
 994                 if (wc->head + 3 > wc->dest_len) {
 995                         return false;
 996                 }
 997                 wc->dest[wc->head] = 255;
 998                 wc->dest[wc->head + 1] = len & 255;
 999                 wc->dest[wc->head + 2] = len >> 8;
1000                 wc->head += 3;
1001         } else {
1002                 if (wc->head + 7 > wc->dest_len) {
1003                         return false;
1004                 }
1005                 wc->dest[wc->head] = 255;
1006                 wc->dest[wc->head + 1] = 0;
1007                 wc->dest[wc->head + 2] = 0;
1008                 wc->dest[wc->head + 3] = len & 255;
1009                 wc->dest[wc->head + 4] = (len >> 8) & 255;
1010                 wc->dest[wc->head + 5] = (len >> 16) & 255;
1011                 wc->dest[wc->head + 6] = (len >> 24) & 255;
1012                 wc->head += 7;
1013         }
1014         return true;
1015 }
1016
1017 static ssize_t write_compressed_bytes(uint16_t symbol_values[512],
1018                                       uint16_t *intermediate,
1019                                       size_t intermediate_len,
1020                                       uint8_t *dest,
1021                                       size_t dest_len)
1022 {
1023         bool ok;
1024         size_t i;
1025         size_t end;
1026         struct write_context wc = {
1027                 .head = 4,
1028                 .pending_next_code = 2,
1029                 .dest = dest,
1030                 .dest_len = dest_len
1031         };
1032         for (i = 0; i < intermediate_len; i++) {
1033                 uint16_t c = intermediate[i];
1034                 size_t len;
1035                 uint16_t distance;
1036                 uint16_t code_len = 0;
1037                 uint16_t code_dist = 0;
1038                 if (c < 256) {
1039                         ok = write_code(&wc, symbol_values[c]);
1040                         if (!ok) {
1041                                 return LZXPRESS_ERROR;
1042                         }
1043                         continue;
1044                 }
1045
1046                 if (c == 0xfffe) {
1047                         if (i > intermediate_len - 4) {
1048                                 return LZXPRESS_ERROR;
1049                         }
1050
1051                         len = intermediate[i + 1];
1052                         len |= (uint32_t)intermediate[i + 2] << 16;
1053                         distance = intermediate[i + 3];
1054                         i += 3;
1055                 } else if (c == 0xffff) {
1056                         if (i > intermediate_len - 3) {
1057                                 return LZXPRESS_ERROR;
1058                         }
1059                         len = intermediate[i + 1];
1060                         distance = intermediate[i + 2];
1061                         i += 2;
1062                 } else {
1063                         return LZXPRESS_ERROR;
1064                 }
1065                 if (unlikely(distance == 0)) {
1066                         return LZXPRESS_ERROR;
1067                 }
1068                 /* len has already had 3 subtracted */
1069                 if (len >= 15) {
1070                         /*
1071                          * We are going to need to write extra length
1072                          * bytes into the stream, but we don't do it
1073                          * now, we do it after the code has been
1074                          * written (and before the distance bits).
1075                          */
1076                         code_len = 15;
1077                 } else {
1078                         code_len = len;
1079                 }
1080                 code_dist = bitlen_nonzero_16(distance);
1081                 c = 256 | (code_dist << 4) | code_len;
1082                 if (c > 511) {
1083                         return LZXPRESS_ERROR;
1084                 }
1085
1086                 ok = write_code(&wc, symbol_values[c]);
1087                 if (!ok) {
1088                         return LZXPRESS_ERROR;
1089                 }
1090
1091                 if (code_len == 15) {
1092                         if (len >= 270) {
1093                                 ok = write_long_len(&wc, len);
1094                         } else {
1095                                 ok = write_byte(&wc, len - 15);
1096                         }
1097                         if (! ok) {
1098                                 return LZXPRESS_ERROR;
1099                         }
1100                 }
1101                 if (code_dist != 0) {
1102                         uint16_t dist_bits = distance - (1 << code_dist);
1103                         ok = write_bits(&wc, dist_bits, code_dist);
1104                         if (!ok) {
1105                                 return LZXPRESS_ERROR;
1106                         }
1107                 }
1108         }
1109         /*
1110          * There are some intricacies around flushing the bits and returning
1111          * the length.
1112          *
1113          * If the returned length is not exactly right and there is another
1114          * block, that block will read its huffman table from the wrong place,
1115          * and have all the symbol codes out by a multiple of 4.
1116          */
1117         end = wc.head;
1118         if (wc.bit_len == 0) {
1119                 end -= 2;
1120         }
1121         ok = write_bits(&wc, 0, 16 - wc.bit_len);
1122         if (!ok) {
1123                 return LZXPRESS_ERROR;
1124         }
1125         for (i = 0; i < 2; i++) {
1126                 /*
1127                  * Flush out the bits with zeroes. It doesn't matter if we do
1128                  * a round too many, as we have buffer space, and have already
1129                  * determined the returned length (end).
1130                  */
1131                 ok = write_bits(&wc, 0, 16);
1132                 if (!ok) {
1133                         return LZXPRESS_ERROR;
1134                 }
1135         }
1136         return end;
1137 }
1138
1139
1140 static ssize_t lzx_huffman_compress_block(struct lzxhuff_compressor_context *cmp_ctx,
1141                                           struct lzxhuff_compressor_mem *cmp_mem,
1142                                           size_t block_no)
1143 {
1144         ssize_t intermediate_size;
1145         uint16_t *hash_table = NULL;
1146         uint16_t *back_window_hash_table = NULL;
1147         ssize_t bytes_written;
1148
1149         if (cmp_ctx->available_size - cmp_ctx->output_pos < 260) {
1150                 /* huffman block + 4 bytes */
1151                 return LZXPRESS_ERROR;
1152         }
1153
1154         /*
1155          * For LZ77 compression, we keep a hash table for the previous block,
1156          * via alternation after the first block.
1157          *
1158          * LZ77 writes into the intermediate buffer in the cmp_mem context.
1159          */
1160         if (block_no == 0) {
1161                 hash_table = cmp_mem->hash_table1;
1162                 back_window_hash_table = NULL;
1163         } else if (block_no & 1) {
1164                 hash_table = cmp_mem->hash_table2;
1165                 back_window_hash_table = cmp_mem->hash_table1;
1166         } else {
1167                 hash_table = cmp_mem->hash_table1;
1168                 back_window_hash_table = cmp_mem->hash_table2;
1169         }
1170
1171         intermediate_size = lz77_encode_block(cmp_ctx,
1172                                               cmp_mem,
1173                                               hash_table,
1174                                               back_window_hash_table);
1175
1176         if (intermediate_size < 0) {
1177                 return intermediate_size;
1178         }
1179
1180         /*
1181          * Write the 256 byte Huffman table, based on the counts gained in
1182          * LZ77 phase.
1183          */
1184         bytes_written = write_huffman_table(
1185                 cmp_mem->symbol_values,
1186                 cmp_ctx->output + cmp_ctx->output_pos,
1187                 cmp_ctx->available_size - cmp_ctx->output_pos);
1188
1189         if (bytes_written != 256) {
1190                 return LZXPRESS_ERROR;
1191         }
1192         cmp_ctx->output_pos += 256;
1193
1194         /*
1195          * Write the compressed bytes using the LZ77 matches and Huffman codes
1196          * worked out in the previous steps.
1197          */
1198         bytes_written = write_compressed_bytes(
1199                 cmp_mem->symbol_values,
1200                 cmp_mem->intermediate,
1201                 intermediate_size,
1202                 cmp_ctx->output + cmp_ctx->output_pos,
1203                 cmp_ctx->available_size - cmp_ctx->output_pos);
1204
1205         if (bytes_written < 0) {
1206                 return bytes_written;
1207         }
1208
1209         cmp_ctx->output_pos += bytes_written;
1210         return bytes_written;
1211 }
1212
1213 /*
1214  * lzxpress_huffman_max_compressed_size()
1215  *
1216  * Return the most bytes the compression can take, to allow
1217  * pre-allocation.
1218  */
size_t lzxpress_huffman_max_compressed_size(size_t input_size)
{
        /*
         * Incompressible data comes out at roughly its original size plus
         * one 256 byte table per 64k block. An eighth of the input plus a
         * fixed 270 bytes is a deliberately generous allowance for that,
         * while staying in the same order of magnitude as the input.
         */
        const size_t proportional_slack = input_size / 8;
        const size_t fixed_slack = 270;

        return input_size + proportional_slack + fixed_slack;
}
1228
1229 /*
1230  * lzxpress_huffman_compress_talloc()
1231  *
1232  * This is the convenience function that allocates the compressor context and
1233  * output memory for you. The return value is the number of bytes written to
1234  * the location indicated by the output pointer.
1235  *
1236  * The maximum input_size is effectively around 227MB due to the need to guess
1237  * an upper bound on the output size that hits an internal limitation in
1238  * talloc.
1239  *
1240  * @param mem_ctx      TALLOC_CTX parent for the compressed buffer.
1241  * @param input_bytes  memory to be compressed.
1242  * @param input_size   length of the input buffer.
1243  * @param output       destination pointer for the compressed data.
1244  *
1245  * @return the number of bytes written or -1 on error.
1246  */
1247
1248 ssize_t lzxpress_huffman_compress_talloc(TALLOC_CTX *mem_ctx,
1249                                          const uint8_t *input_bytes,
1250                                          size_t input_size,
1251                                          uint8_t **output)
1252 {
1253         struct lzxhuff_compressor_mem *cmp = NULL;
1254         size_t alloc_size = lzxpress_huffman_max_compressed_size(input_size);
1255
1256         ssize_t output_size;
1257
1258         *output = talloc_array(mem_ctx, uint8_t, alloc_size);
1259         if (*output == NULL) {
1260                 return LZXPRESS_ERROR;
1261         }
1262
1263         cmp = talloc(mem_ctx, struct lzxhuff_compressor_mem);
1264         if (cmp == NULL) {
1265                 TALLOC_FREE(*output);
1266                 return LZXPRESS_ERROR;
1267         }
1268
1269         output_size = lzxpress_huffman_compress(cmp,
1270                                                 input_bytes,
1271                                                 input_size,
1272                                                 *output,
1273                                                 alloc_size);
1274
1275         talloc_free(cmp);
1276
1277         if (output_size < 0) {
1278                 TALLOC_FREE(*output);
1279                 return LZXPRESS_ERROR;
1280         }
1281
1282         *output = talloc_realloc(mem_ctx, *output, uint8_t, output_size);
1283         if (*output == NULL) {
1284                 return LZXPRESS_ERROR;
1285         }
1286
1287         return output_size;
1288 }
1289
1290 /*
1291  * lzxpress_huffman_compress()
1292  *
1293  * This is the inconvenience function, slightly faster and fiddlier than
1294  * lzxpress_huffman_compress_talloc().
1295  *
1296  * To use this, you need to have allocated (but not initialised) a `struct
1297  * lzxhuff_compressor_mem`, and an output buffer. If the buffer is not big
1298  * enough (per `output_size`), you'll get a negative return value, otherwise
1299  * the number of bytes actually consumed, which will always be at least 260.
1300  *
1301  * The `struct lzxhuff_compressor_mem` is reusable -- it is basically a
1302  * collection of uninitialised memory buffers. The total size is less than
1303  * 150k, so stack allocation is plausible.
1304  *
1305  * input_size and available_size are limited to the minimum of UINT32_MAX and
1306  * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
1307  *
1308  * @param cmp_mem         a struct lzxhuff_compressor_mem.
1309  * @param input_bytes     memory to be compressed.
1310  * @param input_size      length of the input buffer.
1311  * @param output          destination for the compressed data.
1312  * @param available_size  allocated output bytes.
1313  *
1314  * @return the number of bytes written or -1 on error.
1315  */
1316 ssize_t lzxpress_huffman_compress(struct lzxhuff_compressor_mem *cmp_mem,
1317                                   const uint8_t *input_bytes,
1318                                   size_t input_size,
1319                                   uint8_t *output,
1320                                   size_t available_size)
1321 {
1322         size_t i = 0;
1323         struct lzxhuff_compressor_context cmp_ctx = {
1324                 .input_bytes = input_bytes,
1325                 .input_size = input_size,
1326                 .input_pos = 0,
1327                 .prev_block_pos = 0,
1328                 .output = output,
1329                 .available_size = available_size,
1330                 .output_pos = 0
1331         };
1332
1333         if (input_size == 0) {
1334                 /*
1335                  * We can't deal with this for a number of reasons (e.g. it
1336                  * breaks the Huffman tree), and the output will be infinitely
1337                  * bigger than the input. The caller needs to go and think
1338                  * about what they're trying to do here.
1339                  */
1340                 return LZXPRESS_ERROR;
1341         }
1342
1343         if (input_size > SSIZE_MAX ||
1344             input_size > UINT32_MAX ||
1345             available_size > SSIZE_MAX ||
1346             available_size > UINT32_MAX ||
1347             available_size == 0) {
1348                 /*
1349                  * We use negative ssize_t to return errors, which is limiting
1350                  * on 32 bit machines; otherwise we adhere to Microsoft's 4GB
1351                  * limit.
1352                  *
1353                  * lzxpress_huffman_compress_talloc() will not get this far,
1354                  * having already have failed on talloc's 256 MB limit.
1355                  */
1356                 return LZXPRESS_ERROR;
1357         }
1358
1359         if (cmp_mem == NULL ||
1360             output == NULL ||
1361             input_bytes == NULL) {
1362                 return LZXPRESS_ERROR;
1363         }
1364
1365         while (cmp_ctx.input_pos < cmp_ctx.input_size) {
1366                 ssize_t ret;
1367                 ret = lzx_huffman_compress_block(&cmp_ctx,
1368                                                  cmp_mem,
1369                                                  i);
1370                 if (ret < 0) {
1371                         return ret;
1372                 }
1373                 i++;
1374         }
1375
1376         return cmp_ctx.output_pos;
1377 }
1378
1379 static void debug_tree_codes(struct bitstream *input)
1380 {
1381         /*
1382          */
1383         size_t head = 0;
1384         size_t tail = 2;
1385         size_t ffff_count = 0;
1386         struct q {
1387                 uint16_t tree_code;
1388                 uint16_t code_code;
1389         };
1390         struct q queue[65536];
1391         char bits[17];
1392         uint16_t *t = input->table;
1393         queue[0].tree_code = 1;
1394         queue[0].code_code = 2;
1395         queue[1].tree_code = 2;
1396         queue[1].code_code = 3;
1397         while (head < tail) {
1398                 struct q q = queue[head];
1399                 uint16_t x = t[q.tree_code];
1400                 if (x != 0xffff) {
1401                         int k;
1402                         uint16_t j = q.code_code;
1403                         size_t offset = bitlen_nonzero_16(j) - 1;
1404                         if (unlikely(j == 0)) {
1405                                 DBG("BROKEN code is 0!\n");
1406                                 return;
1407                         }
1408
1409                         for (k = 0; k <= offset; k++) {
1410                                 bool b = (j >> (offset - k)) & 1;
1411                                 bits[k] = b ? '1' : '0';
1412                         }
1413                         bits[k] = 0;
1414                         DBG("%03x   %s\n", x & 511, bits);
1415                         head++;
1416                         continue;
1417                 }
1418                 ffff_count++;
1419                 queue[tail].tree_code = q.tree_code * 2 + 1;
1420                 queue[tail].code_code = q.code_code * 2;
1421                 tail++;
1422                 queue[tail].tree_code = q.tree_code * 2 + 1 + 1;
1423                 queue[tail].code_code = q.code_code * 2 + 1;
1424                 tail++;
1425                 head++;
1426         }
1427         DBG("0xffff count: %zu\n", ffff_count);
1428 }
1429
/**
 * Three-way comparison of two uint16_t values, for use as a sort callback.
 *
 * Returns -1 if *a is less than *b, 1 if it is greater, and 0 if they are
 * equal.
 */
static int compare_uint16(const uint16_t *a, const uint16_t *b)
{
        const uint16_t lhs = *a;
        const uint16_t rhs = *b;

        return (lhs > rhs) - (lhs < rhs);
}
1443
1444
1445 static bool fill_decomp_table(struct bitstream *input)
1446 {
1447         /*
1448          * There are 512 symbols, each encoded in 4 bits, which indicates
1449          * their depth in the Huffman tree. The even numbers get the lower
1450          * nibble of each byte, so that the byte hex values look backwards
1451          * (i.e. 0xab encodes b then a). These are allocated Huffman codes in
1452          * order of appearance, per depth.
1453          *
1454          * For example, if the first two bytes were:
1455          *
1456          * 0x23 0x53
1457          *
1458          * the first four codes have the lengths 3, 2, 3, 5.
1459          * Let's call them A, B, C, D.
1460          *
1461          * Suppose there is no other codeword with length 1 (which is
1462          * necessarily true in this example) or 2, but there might be others
1463          * of length 3 or 4. Then we can say this about the codes:
1464          *
1465          *        _ --*--_
1466          *      /          \
1467          *     0           1
1468          *    / \         / \
1469          *   0   1       0   1
1470          *  B    |\     / \  |\
1471          *       0 1   0   1 0 1
1472          *       A C   |\ /| | |\
1473          *
1474          * pos bits  code
1475          * A    3    010
1476          * B    2    00
1477          * C    3    011
1478          * D    5    1????
1479          *
1480          * B has the shortest code, so takes the leftmost branch, 00. That
1481          * ends the branch -- nothing else can start with 00. There are no
1482          * more 2s, so we look at the 3s, starting as far left as possible. So
1483          * A takes 010 and C takes 011. That means everything else has to
1484          * start with 1xx. We don't know how many codewords of length 3 or 4
1485          * there are; if there are none, D would end up with 10000, the
1486          * leftmost available code of length 5. If the compressor is any good,
1487          * there should be no unused leaf nodes left dangling at the end.
1488          *
1489          * (this is "Canonical Huffman Coding").
1490          *
1491          *
1492          * But what symbols do these codes actually stand for?
1493          * --------------------------------------------------
1494          *
1495          * Good question. The first 256 codes stand for the corresponding
1496          * literal bytes. The codes from 256 to 511 stand for LZ77 matches,
1497          * which have a distance and a length, encoded in a strange way that
1498          * isn't entirely the purview of this function.
1499          *
1500          * What does the value 0 mean?
1501          * ---------------------------
1502          *
         * The code does not occur. For example, if the next byte in the
         * example above was 0x07, that would give the byte 0x04 a 7-long
         * code, and no code to the 0x05 byte, which means there is no way
         * we are going to see a 5 in the decoded stream.
1507          *
1508          * Isn't LZ77 + Huffman what zip/gzip/zlib do?
1509          * -------------------------------------------
1510          *
1511          * Yes, DEFLATE is LZ77 + Huffman, but the details are quite different.
1512          */
1513         uint16_t symbols[512];
1514         uint16_t sort_mem[512];
1515         size_t i, n_symbols;
1516         ssize_t code;
1517         uint16_t len = 0, prev_len;
1518         const uint8_t *table_bytes = input->bytes + input->byte_pos;
1519
1520         if (input->byte_pos + 260 > input->byte_size) {
1521                 return false;
1522         }
1523
1524         n_symbols = 0;
1525         for (i = 0; i < 256; i++) {
1526                 uint16_t even = table_bytes[i] & 15;
1527                 uint16_t odd = table_bytes[i] >> 4;
1528                 if (even != 0) {
1529                         symbols[n_symbols] = (even << 9) + i * 2;
1530                         n_symbols++;
1531                 }
1532                 if (odd != 0) {
1533                         symbols[n_symbols] = (odd << 9) + i * 2 + 1;
1534                         n_symbols++;
1535                 }
1536         }
1537         input->byte_pos += 256;
1538         if (n_symbols == 0) {
1539                 return false;
1540         }
1541
1542         stable_sort(symbols, sort_mem, n_symbols, sizeof(uint16_t),
1543                     (samba_compare_fn_t)compare_uint16);
1544
1545         /*
1546          * we're using an implicit binary tree, as you'd see in a heap.
1547          * table[0] = unused
1548          * table[1] = '0'
1549          * table[2] = '1'
1550          * table[3] = '00'     <-- '00' and '01' are children of '0'
1551          * table[4] = '01'     <-- '0' is [1], children are [1 * 2 + {1,2}]
1552          * table[5] = '10'
1553          * table[6] = '11'
1554          * table[7] = '000'
1555          * table[8] = '001'
1556          * table[9] = '010'
1557          * table[10]= '011'
1558          * table[11]= '100'
1559          *
1560          * table[1 << n - 1] = '0' * n
1561          * table[1 << n - 1 + x] = n-bit wide x (left padded with '0')
1562          * table[1 << n - 2] = '1' * (n - 1)
1563          *
1564          * table[i]->left =  table[i*2 + 1]
1565          * table[i]->right = table[i*2 + 2]
1566          * table[0xffff] = unused (16 '0's, max len is 15)
1567          *
1568          * therefore e.g. table[70] = table[64     - 1 + 7]
1569          *                          = table[1 << 6 - 1 + 7]
1570          *                          = '000111' (binary 7, widened to 6 bits)
1571          *
1572          *   and if '000111' is a code,
1573          *   '00011', '0001', '000', '00', '0' are unavailable prefixes.
1574          *       34      16      7     3    1  are their indices
1575          *   and (i - 1) >> 1 is the path back from 70 through these.
1576          *
1577          * the lookup is
1578          *
1579          * 1 start with i = 0
1580          * 2 extract a symbol bit (i = (i << 1) + bit + 1)
1581          * 3 is table[i] == 0xffff?
1582          * 4  yes -- goto 2
1583          * 5  no  -- table[i] & 511 is the symbol, stop
1584          *
1585          * and the construction (here) is sort of the reverse.
1586          *
1587          * Most of this table is free space that can never be reached, and
1588          * most of the activity is at the beginning (since all codes start
1589          * there, and by design the shortest codes are the most common).
1590          */
1591         for (i = 0; i < 32; i++) {
1592                 /* prefill the table head */
1593                 input->table[i] = 0xffff;
1594         }
1595         code = -1;
1596         prev_len = 0;
1597         for (i = 0; i < n_symbols; i++) {
1598                 uint16_t s = symbols[i];
1599                 uint16_t prefix;
1600                 len = (s >> 9) & 15;
1601                 s &= 511;
1602                 code++;
1603                 while (len != prev_len) {
1604                         code <<= 1;
1605                         code++;
1606                         prev_len++;
1607                 }
1608
1609                 if (code >= 65535) {
1610                         return false;
1611                 }
1612                 input->table[code] = s;
1613                 for(prefix = (code - 1) >> 1;
1614                     prefix > 31;
1615                     prefix = (prefix - 1) >> 1) {
1616                         input->table[prefix] = 0xffff;
1617                 }
1618         }
1619         if (CHECK_DEBUGLVL(10)) {
1620                 debug_tree_codes(input);
1621         }
1622
1623         /*
1624          * check that the last code encodes 11111..., with right number of
1625          * ones, pointing to the right symbol -- otherwise we have a dangling
1626          * uninitialised symbol.
1627          */
1628         if (code != (1 << (len + 1)) - 2) {
1629                 return false;
1630         }
1631         return true;
1632 }
1633
1634
/*
 * Read 4 bytes with PULL_LE_U32 at input->byte_pos, store the value in dest,
 * and advance input->byte_pos by 4.
 *
 * If fewer than 4 bytes remain before input->byte_size, the *enclosing
 * function* returns LZXPRESS_ERROR. The macro relies on a variable named
 * `input` (with bytes, byte_pos and byte_size members) being in scope.
 */
#define CHECK_READ_32(dest) \
	do { \
		if (input->byte_size < input->byte_pos + 4) { \
			return LZXPRESS_ERROR; \
		} \
		(dest) = PULL_LE_U32(input->bytes, input->byte_pos); \
		input->byte_pos += 4; \
	} while (0)
1643
/*
 * Read 2 bytes with PULL_LE_U16 at input->byte_pos, store the value in dest,
 * and advance input->byte_pos by 2.
 *
 * If fewer than 2 bytes remain before input->byte_size, the *enclosing
 * function* returns LZXPRESS_ERROR. The macro relies on a variable named
 * `input` (with bytes, byte_pos and byte_size members) being in scope.
 */
#define CHECK_READ_16(dest) \
	do { \
		if (input->byte_size < input->byte_pos + 2) { \
			return LZXPRESS_ERROR; \
		} \
		(dest) = PULL_LE_U16(input->bytes, input->byte_pos); \
		input->byte_pos += 2; \
	} while (0)
1652
/*
 * Read 1 byte with PULL_LE_U8 at input->byte_pos, store the value in dest,
 * and advance input->byte_pos by 1.
 *
 * If input->byte_pos has already reached input->byte_size, the *enclosing
 * function* returns LZXPRESS_ERROR. The macro relies on a variable named
 * `input` (with bytes, byte_pos and byte_size members) being in scope.
 */
#define CHECK_READ_8(dest) \
	do { \
		if (input->byte_size <= input->byte_pos) { \
			return LZXPRESS_ERROR; \
		} \
		(dest) = PULL_LE_U8(input->bytes, input->byte_pos); \
		input->byte_pos++; \
	} while (0)
1661
1662
1663 static inline ssize_t pull_bits(struct bitstream *input)
1664 {
1665         if (input->byte_pos + 1 < input->byte_size) {
1666                 uint16_t tmp;
1667                 CHECK_READ_16(tmp);
1668                 input->remaining_bits += 16;
1669                 input->bits <<= 16;
1670                 input->bits |= tmp;
1671         } else if (input->byte_pos < input->byte_size) {
1672                 uint8_t tmp;
1673                 CHECK_READ_8(tmp);
1674                 input->remaining_bits += 8;
1675                 input->bits <<= 8;
1676                 input->bits |= tmp;
1677         } else {
1678                 return LZXPRESS_ERROR;
1679         }
1680         return 0;
1681 }
1682
1683
1684 /*
1685  * Decompress a block. The actual decompressed size is returned (or -1 on
1686  * error). The putative block length is 64k (or shorter, if the message ends
1687  * first), but a match can run over the end, extending the block. That's why
1688  * we need the overall output size as well as the block size. A match encoded
1689  * in this block can point back to previous blocks, but not before the
1690  * beginning of the message, so we also need the previously decoded size.
1691  *
1692  * The compressed block will have 256 bytes for the Huffman table, and at
1693  * least 4 bytes of (possibly padded) encoded values.
1694  */
1695 static ssize_t lzx_huffman_decompress_block(struct bitstream *input,
1696                                             uint8_t *output,
1697                                             size_t block_size,
1698                                             size_t output_size,
1699                                             size_t previous_size)
1700 {
1701         size_t output_pos = 0;
1702         uint16_t symbol;
1703         size_t index;
1704         uint16_t distance_bits_wanted = 0;
1705         size_t distance = 0;
1706         size_t length = 0;
1707         bool ok;
1708         uint32_t tmp;
1709         bool seen_eof_marker = false;
1710
1711         ok = fill_decomp_table(input);
1712         if (! ok) {
1713                 return LZXPRESS_ERROR;
1714         }
1715         if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) {
1716                 debug_huffman_tree_from_table(input->table);
1717         }
1718         /*
1719          * Always read 32 bits at the start, even if we don't need them.
1720          */
1721         CHECK_READ_16(tmp);
1722         CHECK_READ_16(input->bits);
1723         input->bits |= tmp << 16;
1724         input->remaining_bits = 32;
1725
1726         /*
1727          * This loop iterates over individual *bits*. These are read from
1728          * little-endian 16 bit words, most significant bit first.
1729          *
1730          * At points in the bitstream, the following are possible:
1731          *
1732          * # the source word is empty and needs to be refilled from the input
1733          *    stream.
1734          * # an incomplete codeword is being extended.
1735          * # a codeword is resolved, either as a literal or a match.
1736          * # a literal is written.
1737          * # a match is collecting distance bits.
1738          * # the output stream is copied, as specified by a match.
1739          * # input bytes are read for match lengths.
1740          *
1741          * Note that we *don't* specifically check for the EOF marker (symbol
1742          * 256) in this loop, because the precondition for stopping for the
1743          * EOF marker is that the output buffer is full (otherwise, you
1744          * wouldn't know which 256 is EOF, rather than an actual symbol), and
1745          * we *always* want to stop when the buffer is full. So we work out if
1746          * there is an EOF in another loop after we stop writing.
1747          */
1748
1749         index = 0;
1750         while (output_pos < block_size) {
1751                 uint16_t b;
1752                 if (input->remaining_bits == 16) {
1753                         ssize_t ret = pull_bits(input);
1754                         if (ret) {
1755                                 return ret;
1756                         }
1757                 }
1758                 input->remaining_bits--;
1759
1760                 b = (input->bits >> input->remaining_bits) & 1;
1761                 if (length == 0) {
1762                         /* not in a match; pulling a codeword */
1763                         index <<= 1;
1764                         index += b + 1;
1765                         if (input->table[index] == 0xffff) {
1766                                 /* incomplete codeword, the common case */
1767                                 continue;
1768                         }
1769                         /* found the symbol, reset the code string */
1770                         symbol = input->table[index] & 511;
1771                         index = 0;
1772                         if (symbol < 256) {
1773                                 /* a literal, the easy case */
1774                                 output[output_pos] = symbol;
1775                                 output_pos++;
1776                                 continue;
1777                         }
1778
1779                         /* the beginning of a match */
1780                         distance_bits_wanted = (symbol >> 4) & 15;
1781                         distance = 1 << distance_bits_wanted;
1782                         length = symbol & 15;
1783                         if (length == 15) {
1784                                 CHECK_READ_8(tmp);
1785                                 length += tmp;
1786                                 if (length == 255 + 15) {
1787                                         /*
1788                                          * note, we discard (don't add) the
1789                                          * length so far.
1790                                          */
1791                                         CHECK_READ_16(length);
1792                                         if (length == 0) {
1793                                                 CHECK_READ_32(length);
1794                                         }
1795                                 }
1796                         }
1797                         length += 3;
1798                 } else {
1799                         /* we are pulling extra distance bits */
1800                         distance_bits_wanted--;
1801                         distance |= b << distance_bits_wanted;
1802                 }
1803
1804                 if (distance_bits_wanted == 0) {
1805                         /*
1806                          * We have a complete match, and it is time to do the
1807                          * copy (byte by byte, because the ranges can overlap,
1808                          * and we might need to copy bytes we just copied in).
1809                          *
1810                          * It is possible that this match will extend beyond
1811                          * the end of the expected block. That's fine, so long
1812                          * as it doesn't extend past the total output size.
1813                          */
1814                         size_t i;
1815                         size_t end = output_pos + length;
1816                         uint8_t *here = output + output_pos;
1817                         uint8_t *there = here - distance;
1818                         if (end > output_size ||
1819                             previous_size + output_pos < distance ||
1820                             unlikely(end < output_pos || there > here)) {
1821                                 return LZXPRESS_ERROR;
1822                         }
1823                         for (i = 0; i < length; i++) {
1824                                 here[i] = there[i];
1825                         }
1826                         output_pos += length;
1827                         distance = 0;
1828                         length = 0;
1829                 }
1830         }
1831
1832         if (length != 0 || index != 0) {
1833                 /* it seems like we've hit an early end, mid-code */
1834                 return LZXPRESS_ERROR;
1835         }
1836
1837         if (input->byte_pos + 256 < input->byte_size) {
1838                 /*
1839                  * This block is over, but it clearly isn't the last block, so
1840                  * we don't want to look for the EOF.
1841                  */
1842                 return output_pos;
1843         }
1844         /*
1845          * We won't write any more, but we try to read some more to make sure
1846          * we're finishing in a good place. That means we want to see a 256
1847          * symbol and then some number of zeroes, possibly zero, but as many
1848          * as 32.
1849          *
1850          * In this we are perhaps a bit stricter than Windows, which
1851          * apparently does not insist on the EOF marker, nor on a lack of
1852          * trailing bytes.
1853          */
1854         while (true) {
1855                 uint16_t b;
1856                 if (input->remaining_bits == 16) {
1857                         ssize_t ret;
1858                         if (input->byte_pos == input->byte_size) {
1859                                 /* FIN */
1860                                 break;
1861                         }
1862                         ret = pull_bits(input);
1863                         if (ret) {
1864                                 return ret;
1865                         }
1866                 }
1867                 input->remaining_bits--;
1868                 b = (input->bits >> input->remaining_bits) & 1;
1869                 if (seen_eof_marker) {
1870                         /*
1871                          * we have read an EOF symbols. Now we just want to
1872                          * see zeroes.
1873                          */
1874                         if (b != 0) {
1875                                 return LZXPRESS_ERROR;
1876                         }
1877                         continue;
1878                 }
1879
1880                 /* we're pulling in a symbol, which had better be 256 */
1881                 index <<= 1;
1882                 index += b + 1;
1883                 if (input->table[index] == 0xffff) {
1884                         continue;
1885                 }
1886
1887                 symbol = input->table[index] & 511;
1888                 if (symbol != 256) {
1889                         return LZXPRESS_ERROR;
1890                 }
1891                 seen_eof_marker = true;
1892                 continue;
1893         }
1894
1895         if (! seen_eof_marker) {
1896                 return LZXPRESS_ERROR;
1897         }
1898
1899         return output_pos;
1900 }
1901
1902 static ssize_t lzxpress_huffman_decompress_internal(struct bitstream *input,
1903                                                     uint8_t *output,
1904                                                     size_t output_size)
1905 {
1906         size_t output_pos = 0;
1907
1908         if (input->byte_size < 260) {
1909                 return LZXPRESS_ERROR;
1910         }
1911
1912         while (input->byte_pos < input->byte_size) {
1913                 ssize_t block_output_pos;
1914                 ssize_t block_output_size;
1915                 size_t remaining_output_size = output_size - output_pos;
1916
1917                 block_output_size = MIN(65536, remaining_output_size);
1918
1919                 block_output_pos = lzx_huffman_decompress_block(
1920                         input,
1921                         output + output_pos,
1922                         block_output_size,
1923                         remaining_output_size,
1924                         output_pos);
1925
1926                 if (block_output_pos < block_output_size) {
1927                         return LZXPRESS_ERROR;
1928                 }
1929                 output_pos += block_output_pos;
1930                 if (output_pos > output_size) {
1931                         /* not expecting to get here. */
1932                         return LZXPRESS_ERROR;
1933                 }
1934         }
1935
1936         if (input->byte_pos != input->byte_size) {
1937                 return LZXPRESS_ERROR;
1938         }
1939
1940         return output_pos;
1941 }
1942
1943
1944 /*
1945  * lzxpress_huffman_decompress()
1946  *
1947  * output_size must be the expected length of the decompressed data.
1948  * input_size and output_size are limited to the minimum of UINT32_MAX and
1949  * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
1950  *
1951  * @param input_bytes  memory to be decompressed.
1952  * @param input_size   length of the compressed buffer.
1953  * @param output       destination for the decompressed data.
1954  * @param output_size  exact expected length of the decompressed data.
1955  *
1956  * @return the number of bytes written or -1 on error.
1957  */
1958
1959 ssize_t lzxpress_huffman_decompress(const uint8_t *input_bytes,
1960                                     size_t input_size,
1961                                     uint8_t *output,
1962                                     size_t output_size)
1963 {
1964         uint16_t table[65536];
1965         struct bitstream input = {
1966                 .bytes = input_bytes,
1967                 .byte_size = input_size,
1968                 .byte_pos = 0,
1969                 .bits = 0,
1970                 .remaining_bits = 0,
1971                 .table = table
1972         };
1973
1974         if (input_size > SSIZE_MAX ||
1975             input_size > UINT32_MAX ||
1976             output_size > SSIZE_MAX ||
1977             output_size > UINT32_MAX ||
1978             input_size == 0 ||
1979             output_size == 0 ||
1980             input_bytes == NULL ||
1981             output == NULL) {
1982                 /*
1983                  * We use negative ssize_t to return errors, which is limiting
1984                  * on 32 bit machines, and the 4GB limit exists on Windows.
1985                  */
1986                 return  LZXPRESS_ERROR;
1987         }
1988
1989         return lzxpress_huffman_decompress_internal(&input,
1990                                                     output,
1991                                                     output_size);
1992 }
1993
1994
1995 /**
1996  * lzxpress_huffman_decompress_talloc()
1997  *
1998  * The caller must provide the exact size of the expected output.
1999  *
2000  * The input_size is limited to the minimum of UINT32_MAX and SSIZE_MAX, but
2001  * output_size is limited to 256MB due to a limit in talloc. This effectively
2002  * limits input_size too, as non-crafted compressed data will not exceed the
2003  * decompressed size by very much.
2004  *
2005  * @param mem_ctx      TALLOC_CTX parent for the decompressed buffer.
2006  * @param input_bytes  memory to be decompressed.
2007  * @param input_size   length of the compressed buffer.
2008  * @param output_size  expected decompressed size.
2009  *
2010  * @return a talloc'ed buffer exactly output_size in length, or NULL.
2011  */
2012
2013 uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
2014                                             const uint8_t *input_bytes,
2015                                             size_t input_size,
2016                                             size_t output_size)
2017 {
2018         ssize_t result;
2019         uint8_t *output = NULL;
2020         struct bitstream input = {
2021                 .bytes = input_bytes,
2022                 .byte_size = input_size
2023         };
2024
2025         output = talloc_array(mem_ctx, uint8_t, output_size);
2026         if (output == NULL) {
2027                 return NULL;
2028         }
2029
2030         input.table = talloc_array(mem_ctx, uint16_t, 65536);
2031         if (input.table == NULL) {
2032                 talloc_free(output);
2033                 return NULL;
2034         }
2035         result = lzxpress_huffman_decompress_internal(&input,
2036                                                       output,
2037                                                       output_size);
2038         talloc_free(input.table);
2039
2040         if (result != output_size) {
2041                 talloc_free(output);
2042                 return NULL;
2043         }
2044         return output;
2045 }