Update copyrights to 2021, using "make update-copyright"
[tor.git] / src / lib / compress / compress.c
blob83e63905ccf2315471aaf7c58184d7b6c41b104a
1 /* Copyright (c) 2004, Roger Dingledine.
2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
3 * Copyright (c) 2007-2021, The Tor Project, Inc. */
4 /* See LICENSE for licensing information */
6 /**
7 * \file compress.c
8 * \brief Common compression API implementation.
10 * This file provides a unified interface to all the compression libraries Tor
11 * knows how to use.
12 **/
14 #include "orconfig.h"
16 #include <stdlib.h>
17 #include <stdio.h>
18 #include <string.h>
19 #include "lib/cc/torint.h"
21 #ifdef HAVE_NETINET_IN_H
22 #include <netinet/in.h>
23 #endif
25 #include "lib/log/log.h"
26 #include "lib/log/util_bug.h"
27 #include "lib/arch/bytes.h"
28 #include "lib/ctime/di_ops.h"
29 #include "lib/compress/compress.h"
30 #include "lib/compress/compress_lzma.h"
31 #include "lib/compress/compress_none.h"
32 #include "lib/compress/compress_sys.h"
33 #include "lib/compress/compress_zlib.h"
34 #include "lib/compress/compress_zstd.h"
35 #include "lib/intmath/cmp.h"
36 #include "lib/malloc/malloc.h"
37 #include "lib/subsys/subsys.h"
38 #include "lib/thread/threads.h"
40 /** Total number of bytes allocated for compression state overhead. */
41 static atomic_counter_t total_compress_allocation;
43 /** @{ */
44 /* These macros define the maximum allowable compression factor. Anything of
45 * size greater than CHECK_FOR_COMPRESSION_BOMB_AFTER is not allowed to
46 * have an uncompression factor (uncompressed size:compressed size ratio) of
47 * any greater than MAX_UNCOMPRESSION_FACTOR.
49 * Picking a value for MAX_UNCOMPRESSION_FACTOR is a trade-off: we want it to
50 * be small to limit the attack multiplier, but we also want it to be large
51 * enough so that no legitimate document --even ones we might invent in the
52 * future -- ever compresses by a factor of greater than
53 * MAX_UNCOMPRESSION_FACTOR. Within those parameters, there's a reasonably
54 * large range of possible values. IMO, anything over 8 is probably safe; IMO
55 * anything under 50 is probably sufficient.
57 #define MAX_UNCOMPRESSION_FACTOR 25
58 #define CHECK_FOR_COMPRESSION_BOMB_AFTER (1024*64)
59 /** @} */
61 /** Return true if uncompressing an input of size <b>in_size</b> to an input of
62 * size at least <b>size_out</b> looks like a compression bomb. */
63 MOCK_IMPL(int,
64 tor_compress_is_compression_bomb,(size_t size_in, size_t size_out))
66 if (size_in == 0 || size_out < CHECK_FOR_COMPRESSION_BOMB_AFTER)
67 return 0;
69 return (size_out / size_in > MAX_UNCOMPRESSION_FACTOR);
72 /** Guess the size that <b>in_len</b> will be after compression or
73 * decompression. */
74 static size_t
75 guess_compress_size(int compress, compress_method_t method,
76 compression_level_t compression_level,
77 size_t in_len)
79 // ignore these for now.
80 (void)compression_level;
81 if (method == NO_METHOD) {
82 /* Guess that we'll need an extra byte, to avoid a needless realloc
83 * for nul-termination */
84 return (in_len < SIZE_MAX) ? in_len + 1 : in_len;
87 /* Always guess a factor of 2. */
88 if (compress) {
89 in_len /= 2;
90 } else {
91 if (in_len < SIZE_T_CEILING/2)
92 in_len *= 2;
94 return MAX(in_len, 1024);
97 /** Internal function to implement tor_compress/tor_uncompress, depending on
98 * whether <b>compress</b> is set. All arguments are as for tor_compress or
99 * tor_uncompress. */
100 static int
101 tor_compress_impl(int compress,
102 char **out, size_t *out_len,
103 const char *in, size_t in_len,
104 compress_method_t method,
105 compression_level_t compression_level,
106 int complete_only,
107 int protocol_warn_level)
109 tor_compress_state_t *stream;
110 int rv;
112 stream = tor_compress_new(compress, method, compression_level);
114 if (stream == NULL) {
115 log_warn(LD_GENERAL, "NULL stream while %scompressing",
116 compress?"":"de");
117 log_debug(LD_GENERAL, "method: %d level: %d at len: %lu",
118 method, compression_level, (unsigned long)in_len);
119 return -1;
122 size_t in_len_orig = in_len;
123 size_t out_remaining, out_alloc;
124 char *outptr;
126 out_remaining = out_alloc =
127 guess_compress_size(compress, method, compression_level, in_len);
128 *out = outptr = tor_malloc(out_remaining);
130 const int finish = complete_only || compress;
132 while (1) {
133 switch (tor_compress_process(stream,
134 &outptr, &out_remaining,
135 &in, &in_len, finish)) {
136 case TOR_COMPRESS_DONE:
137 if (in_len == 0 || compress) {
138 goto done;
139 } else {
140 // More data is present, and we're decompressing. So we may need to
141 // reinitialize the stream if we are handling multiple concatenated
142 // inputs.
143 tor_compress_free(stream);
144 stream = tor_compress_new(compress, method, compression_level);
145 if (stream == NULL) {
146 log_warn(LD_GENERAL, "NULL stream while %scompressing",
147 compress?"":"de");
148 goto err;
151 break;
152 case TOR_COMPRESS_OK:
153 if (compress || complete_only) {
154 log_fn(protocol_warn_level, LD_PROTOCOL,
155 "Unexpected %s while %scompressing",
156 complete_only?"end of input":"result",
157 compress?"":"de");
158 log_debug(LD_GENERAL, "method: %d level: %d at len: %lu",
159 method, compression_level, (unsigned long)in_len);
160 goto err;
161 } else {
162 if (in_len == 0) {
163 goto done;
166 break;
167 case TOR_COMPRESS_BUFFER_FULL: {
168 if (!compress && outptr < *out+out_alloc) {
169 // A buffer error in this case means that we have a problem
170 // with our input.
171 log_fn(protocol_warn_level, LD_PROTOCOL,
172 "Possible truncated or corrupt compressed data");
173 goto err;
175 if (out_alloc >= SIZE_T_CEILING / 2) {
176 log_warn(LD_GENERAL, "While %scompressing data: ran out of space.",
177 compress?"":"un");
178 goto err;
180 if (!compress &&
181 tor_compress_is_compression_bomb(in_len_orig, out_alloc)) {
182 // This should already have been caught down in the backend logic.
183 // LCOV_EXCL_START
184 tor_assert_nonfatal_unreached();
185 goto err;
186 // LCOV_EXCL_STOP
188 const size_t offset = outptr - *out;
189 out_alloc *= 2;
190 *out = tor_realloc(*out, out_alloc);
191 outptr = *out + offset;
192 out_remaining = out_alloc - offset;
193 break;
195 case TOR_COMPRESS_ERROR:
196 log_fn(protocol_warn_level, LD_GENERAL,
197 "Error while %scompressing data: bad input?",
198 compress?"":"un");
199 goto err; // bad data.
201 // LCOV_EXCL_START
202 default:
203 tor_assert_nonfatal_unreached();
204 goto err;
205 // LCOV_EXCL_STOP
208 done:
209 *out_len = outptr - *out;
210 if (compress && tor_compress_is_compression_bomb(*out_len, in_len_orig)) {
211 log_warn(LD_BUG, "We compressed something and got an insanely high "
212 "compression factor; other Tors would think this was a "
213 "compression bomb.");
214 goto err;
216 if (!compress) {
217 // NUL-terminate our output.
218 if (out_alloc == *out_len)
219 *out = tor_realloc(*out, out_alloc + 1);
220 (*out)[*out_len] = '\0';
222 rv = 0;
223 goto out;
225 err:
226 tor_free(*out);
227 *out_len = 0;
228 rv = -1;
229 goto out;
231 out:
232 tor_compress_free(stream);
233 return rv;
236 /** Given <b>in_len</b> bytes at <b>in</b>, compress them into a newly
237 * allocated buffer, using the method described in <b>method</b>. Store the
238 * compressed string in *<b>out</b>, and its length in *<b>out_len</b>.
239 * Return 0 on success, -1 on failure.
242 tor_compress(char **out, size_t *out_len,
243 const char *in, size_t in_len,
244 compress_method_t method)
246 return tor_compress_impl(1, out, out_len, in, in_len, method,
247 BEST_COMPRESSION,
248 1, LOG_WARN);
251 /** Given zero or more compressed strings of total length <b>in_len</b> bytes
252 * at <b>in</b>, uncompress them into a newly allocated buffer, using the
253 * method described in <b>method</b>. Store the uncompressed string in
254 * *<b>out</b>, and its length in *<b>out_len</b>. Return 0 on success, -1 on
255 * failure.
257 * If any bytes are written to <b>out</b>, an extra byte NUL is always
258 * written at the end, but not counted in <b>out_len</b>. This is a
259 * safety feature to ensure that the output can be treated as a
260 * NUL-terminated string -- though of course, callers should check
261 * out_len anyway.
263 * If <b>complete_only</b> is true, we consider a truncated input as a
264 * failure; otherwise we decompress as much as we can. Warn about truncated
265 * or corrupt inputs at <b>protocol_warn_level</b>.
268 tor_uncompress(char **out, size_t *out_len,
269 const char *in, size_t in_len,
270 compress_method_t method,
271 int complete_only,
272 int protocol_warn_level)
274 return tor_compress_impl(0, out, out_len, in, in_len, method,
275 BEST_COMPRESSION,
276 complete_only, protocol_warn_level);
279 /** Try to tell whether the <b>in_len</b>-byte string in <b>in</b> is likely
280 * to be compressed or not. If it is, return the likeliest compression method.
281 * Otherwise, return UNKNOWN_METHOD.
283 compress_method_t
284 detect_compression_method(const char *in, size_t in_len)
286 if (in_len > 2 && fast_memeq(in, "\x1f\x8b", 2)) {
287 return GZIP_METHOD;
288 } else if (in_len > 2 && (in[0] & 0x0f) == 8 &&
289 (tor_ntohs(get_uint16(in)) % 31) == 0) {
290 return ZLIB_METHOD;
291 } else if (in_len > 2 &&
292 fast_memeq(in, "\x5d\x00\x00", 3)) {
293 return LZMA_METHOD;
294 } else if (in_len > 3 &&
295 fast_memeq(in, "\x28\xb5\x2f\xfd", 4)) {
296 return ZSTD_METHOD;
297 } else {
298 return UNKNOWN_METHOD;
302 /** Return 1 if a given <b>method</b> is supported; otherwise 0. */
304 tor_compress_supports_method(compress_method_t method)
306 switch (method) {
307 case GZIP_METHOD:
308 case ZLIB_METHOD:
309 return tor_zlib_method_supported();
310 case LZMA_METHOD:
311 return tor_lzma_method_supported();
312 case ZSTD_METHOD:
313 return tor_zstd_method_supported();
314 case NO_METHOD:
315 return 1;
316 case UNKNOWN_METHOD:
317 default:
318 return 0;
323 * Return a bitmask of the supported compression types, where 1&lt;&lt;m is
324 * set in the bitmask if and only if compression with method <b>m</b> is
325 * supported.
327 unsigned
328 tor_compress_get_supported_method_bitmask(void)
330 static unsigned supported = 0;
331 if (supported == 0) {
332 compress_method_t m;
333 for (m = NO_METHOD; m <= UNKNOWN_METHOD; ++m) {
334 if (tor_compress_supports_method(m)) {
335 supported |= (1u << m);
339 return supported;
342 /** Table of compression method names. These should have an "x-" prefix,
343 * if they are not listed in the IANA content coding registry. */
344 static const struct {
345 const char *name;
346 compress_method_t method;
347 } compression_method_names[] = {
348 { "gzip", GZIP_METHOD },
349 { "deflate", ZLIB_METHOD },
350 // We call this "x-tor-lzma" rather than "x-lzma", because we impose a
351 // lower maximum memory usage on the decoding side.
352 { "x-tor-lzma", LZMA_METHOD },
353 { "x-zstd" , ZSTD_METHOD },
354 { "identity", NO_METHOD },
356 /* Later entries in this table are not canonical; these are recognized but
357 * not emitted. */
358 { "x-gzip", GZIP_METHOD },
361 /** Return the canonical string representation of the compression method
362 * <b>method</b>, or NULL if the method isn't recognized. */
363 const char *
364 compression_method_get_name(compress_method_t method)
366 unsigned i;
367 for (i = 0; i < ARRAY_LENGTH(compression_method_names); ++i) {
368 if (method == compression_method_names[i].method)
369 return compression_method_names[i].name;
371 return NULL;
374 /** Table of compression human readable method names. */
375 static const struct {
376 compress_method_t method;
377 const char *name;
378 } compression_method_human_names[] = {
379 { NO_METHOD, "uncompressed" },
380 { GZIP_METHOD, "gzipped" },
381 { ZLIB_METHOD, "deflated" },
382 { LZMA_METHOD, "LZMA compressed" },
383 { ZSTD_METHOD, "Zstandard compressed" },
384 { UNKNOWN_METHOD, "unknown encoding" },
387 /** Return a human readable string representation of the compression method
388 * <b>method</b>, or NULL if the method isn't recognized. */
389 const char *
390 compression_method_get_human_name(compress_method_t method)
392 unsigned i;
393 for (i = 0; i < ARRAY_LENGTH(compression_method_human_names); ++i) {
394 if (method == compression_method_human_names[i].method)
395 return compression_method_human_names[i].name;
397 return NULL;
400 /** Return the compression method represented by the string <b>name</b>, or
401 * UNKNOWN_METHOD if the string isn't recognized. */
402 compress_method_t
403 compression_method_get_by_name(const char *name)
405 unsigned i;
406 for (i = 0; i < ARRAY_LENGTH(compression_method_names); ++i) {
407 if (!strcmp(compression_method_names[i].name, name))
408 return compression_method_names[i].method;
410 return UNKNOWN_METHOD;
413 /** Return a string representation of the version of the library providing the
414 * compression method given in <b>method</b>. Returns NULL if <b>method</b> is
415 * unknown or unsupported. */
416 const char *
417 tor_compress_version_str(compress_method_t method)
419 switch (method) {
420 case GZIP_METHOD:
421 case ZLIB_METHOD:
422 return tor_zlib_get_version_str();
423 case LZMA_METHOD:
424 return tor_lzma_get_version_str();
425 case ZSTD_METHOD:
426 return tor_zstd_get_version_str();
427 case NO_METHOD:
428 case UNKNOWN_METHOD:
429 default:
430 return NULL;
434 /** Return a string representation of the version of the library, found at
435 * compile time, providing the compression method given in <b>method</b>.
436 * Returns NULL if <b>method</b> is unknown or unsupported. */
437 const char *
438 tor_compress_header_version_str(compress_method_t method)
440 switch (method) {
441 case GZIP_METHOD:
442 case ZLIB_METHOD:
443 return tor_zlib_get_header_version_str();
444 case LZMA_METHOD:
445 return tor_lzma_get_header_version_str();
446 case ZSTD_METHOD:
447 return tor_zstd_get_header_version_str();
448 case NO_METHOD:
449 case UNKNOWN_METHOD:
450 default:
451 return NULL;
455 /** Return the approximate number of bytes allocated for all
456 * supported compression schemas. */
457 size_t
458 tor_compress_get_total_allocation(void)
460 return atomic_counter_get(&total_compress_allocation) +
461 tor_zlib_get_total_allocation() +
462 tor_lzma_get_total_allocation() +
463 tor_zstd_get_total_allocation();
466 /** Internal state for an incremental compression/decompression. The body of
467 * this struct is not exposed. */
468 struct tor_compress_state_t {
469 compress_method_t method; /**< The compression method. */
471 union {
472 tor_zlib_compress_state_t *zlib_state;
473 tor_lzma_compress_state_t *lzma_state;
474 tor_zstd_compress_state_t *zstd_state;
475 } u; /**< Compression backend state. */
478 /** Construct and return a tor_compress_state_t object using <b>method</b>. If
479 * <b>compress</b>, it's for compression; otherwise it's for decompression. */
480 tor_compress_state_t *
481 tor_compress_new(int compress, compress_method_t method,
482 compression_level_t compression_level)
484 tor_compress_state_t *state;
486 state = tor_malloc_zero(sizeof(tor_compress_state_t));
487 state->method = method;
489 switch (method) {
490 case GZIP_METHOD:
491 case ZLIB_METHOD: {
492 tor_zlib_compress_state_t *zlib_state =
493 tor_zlib_compress_new(compress, method, compression_level);
495 if (zlib_state == NULL)
496 goto err;
498 state->u.zlib_state = zlib_state;
499 break;
501 case LZMA_METHOD: {
502 tor_lzma_compress_state_t *lzma_state =
503 tor_lzma_compress_new(compress, method, compression_level);
505 if (lzma_state == NULL)
506 goto err;
508 state->u.lzma_state = lzma_state;
509 break;
511 case ZSTD_METHOD: {
512 tor_zstd_compress_state_t *zstd_state =
513 tor_zstd_compress_new(compress, method, compression_level);
515 if (zstd_state == NULL)
516 goto err;
518 state->u.zstd_state = zstd_state;
519 break;
521 case NO_METHOD: {
522 break;
524 case UNKNOWN_METHOD:
525 goto err;
528 atomic_counter_add(&total_compress_allocation,
529 sizeof(tor_compress_state_t));
530 return state;
532 err:
533 tor_free(state);
534 return NULL;
537 /** Compress/decompress some bytes using <b>state</b>. Read up to
538 * *<b>in_len</b> bytes from *<b>in</b>, and write up to *<b>out_len</b> bytes
539 * to *<b>out</b>, adjusting the values as we go. If <b>finish</b> is true,
540 * we've reached the end of the input.
542 * Return TOR_COMPRESS_DONE if we've finished the entire
543 * compression/decompression.
544 * Return TOR_COMPRESS_OK if we're processed everything from the input.
545 * Return TOR_COMPRESS_BUFFER_FULL if we're out of space on <b>out</b>.
546 * Return TOR_COMPRESS_ERROR if the stream is corrupt.
548 tor_compress_output_t
549 tor_compress_process(tor_compress_state_t *state,
550 char **out, size_t *out_len,
551 const char **in, size_t *in_len,
552 int finish)
554 tor_assert(state != NULL);
555 const size_t in_len_orig = *in_len;
556 const size_t out_len_orig = *out_len;
557 tor_compress_output_t rv;
559 if (*out_len == 0 && (*in_len > 0 || finish)) {
560 // If we still have input data, but no space for output data, we might as
561 // well return early and let the caller do the reallocation of the out
562 // variable.
563 return TOR_COMPRESS_BUFFER_FULL;
566 switch (state->method) {
567 case GZIP_METHOD:
568 case ZLIB_METHOD:
569 rv = tor_zlib_compress_process(state->u.zlib_state,
570 out, out_len, in, in_len,
571 finish);
572 break;
573 case LZMA_METHOD:
574 rv = tor_lzma_compress_process(state->u.lzma_state,
575 out, out_len, in, in_len,
576 finish);
577 break;
578 case ZSTD_METHOD:
579 rv = tor_zstd_compress_process(state->u.zstd_state,
580 out, out_len, in, in_len,
581 finish);
582 break;
583 case NO_METHOD:
584 rv = tor_cnone_compress_process(out, out_len, in, in_len,
585 finish);
586 break;
587 default:
588 case UNKNOWN_METHOD:
589 goto err;
591 if (BUG((rv == TOR_COMPRESS_OK) &&
592 *in_len == in_len_orig &&
593 *out_len == out_len_orig)) {
594 log_warn(LD_GENERAL,
595 "More info on the bug: method == %s, finish == %d, "
596 " *in_len == in_len_orig == %lu, "
597 "*out_len == out_len_orig == %lu",
598 compression_method_get_human_name(state->method), finish,
599 (unsigned long)in_len_orig, (unsigned long)out_len_orig);
600 return TOR_COMPRESS_ERROR;
603 return rv;
604 err:
605 return TOR_COMPRESS_ERROR;
608 /** Deallocate <b>state</b>. */
609 void
610 tor_compress_free_(tor_compress_state_t *state)
612 if (state == NULL)
613 return;
615 switch (state->method) {
616 case GZIP_METHOD:
617 case ZLIB_METHOD:
618 tor_zlib_compress_free(state->u.zlib_state);
619 break;
620 case LZMA_METHOD:
621 tor_lzma_compress_free(state->u.lzma_state);
622 break;
623 case ZSTD_METHOD:
624 tor_zstd_compress_free(state->u.zstd_state);
625 break;
626 case NO_METHOD:
627 break;
628 case UNKNOWN_METHOD:
629 break;
632 atomic_counter_sub(&total_compress_allocation,
633 sizeof(tor_compress_state_t));
634 tor_free(state);
637 /** Return the approximate number of bytes allocated for <b>state</b>. */
638 size_t
639 tor_compress_state_size(const tor_compress_state_t *state)
641 tor_assert(state != NULL);
643 size_t size = sizeof(tor_compress_state_t);
645 switch (state->method) {
646 case GZIP_METHOD:
647 case ZLIB_METHOD:
648 size += tor_zlib_compress_state_size(state->u.zlib_state);
649 break;
650 case LZMA_METHOD:
651 size += tor_lzma_compress_state_size(state->u.lzma_state);
652 break;
653 case ZSTD_METHOD:
654 size += tor_zstd_compress_state_size(state->u.zstd_state);
655 break;
656 case NO_METHOD:
657 case UNKNOWN_METHOD:
658 break;
661 return size;
664 /** Initialize all compression modules. */
666 tor_compress_init(void)
668 atomic_counter_init(&total_compress_allocation);
670 tor_zlib_init();
671 tor_lzma_init();
672 tor_zstd_init();
674 return 0;
677 /** Warn if we had any problems while setting up our compression libraries.
679 * (This isn't part of tor_compress_init, since the logs aren't set up yet.)
681 void
682 tor_compress_log_init_warnings(void)
684 // XXXX can we move this into tor_compress_init() after all? log.c queues
685 // XXXX log messages at startup.
686 tor_zstd_warn_if_version_mismatched();
689 static int
690 subsys_compress_initialize(void)
692 return tor_compress_init();
695 const subsys_fns_t sys_compress = {
696 .name = "compress",
697 SUBSYS_DECLARE_LOCATION(),
698 .supported = true,
699 .level = -55,
700 .initialize = subsys_compress_initialize,