1 /* Copyright (c) 2004, Roger Dingledine.
2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
3 * Copyright (c) 2007-2021, The Tor Project, Inc. */
4 /* See LICENSE for licensing information */
8 * \brief Common compression API implementation.
10 * This file provides a unified interface to all the compression libraries Tor
19 #include "lib/cc/torint.h"
21 #ifdef HAVE_NETINET_IN_H
22 #include <netinet/in.h>
25 #include "lib/log/log.h"
26 #include "lib/log/util_bug.h"
27 #include "lib/arch/bytes.h"
28 #include "lib/ctime/di_ops.h"
29 #include "lib/compress/compress.h"
30 #include "lib/compress/compress_lzma.h"
31 #include "lib/compress/compress_none.h"
32 #include "lib/compress/compress_sys.h"
33 #include "lib/compress/compress_zlib.h"
34 #include "lib/compress/compress_zstd.h"
35 #include "lib/intmath/cmp.h"
36 #include "lib/malloc/malloc.h"
37 #include "lib/subsys/subsys.h"
38 #include "lib/thread/threads.h"
40 /** Total number of bytes allocated for compression state overhead. */
41 static atomic_counter_t total_compress_allocation
;
44 /* These macros define the maximum allowable compression factor. Anything of
45 * size greater than CHECK_FOR_COMPRESSION_BOMB_AFTER is not allowed to
46 * have an uncompression factor (uncompressed size:compressed size ratio) of
47 * any greater than MAX_UNCOMPRESSION_FACTOR.
49 * Picking a value for MAX_UNCOMPRESSION_FACTOR is a trade-off: we want it to
50 * be small to limit the attack multiplier, but we also want it to be large
51 * enough so that no legitimate document --even ones we might invent in the
52 * future -- ever compresses by a factor of greater than
53 * MAX_UNCOMPRESSION_FACTOR. Within those parameters, there's a reasonably
54 * large range of possible values. IMO, anything over 8 is probably safe; IMO
55 * anything under 50 is probably sufficient.
57 #define MAX_UNCOMPRESSION_FACTOR 25
58 #define CHECK_FOR_COMPRESSION_BOMB_AFTER (1024*64)
61 /** Return true if uncompressing an input of size <b>in_size</b> to an input of
62 * size at least <b>size_out</b> looks like a compression bomb. */
64 tor_compress_is_compression_bomb
,(size_t size_in
, size_t size_out
))
66 if (size_in
== 0 || size_out
< CHECK_FOR_COMPRESSION_BOMB_AFTER
)
69 return (size_out
/ size_in
> MAX_UNCOMPRESSION_FACTOR
);
72 /** Guess the size that <b>in_len</b> will be after compression or
75 guess_compress_size(int compress
, compress_method_t method
,
76 compression_level_t compression_level
,
79 // ignore these for now.
80 (void)compression_level
;
81 if (method
== NO_METHOD
) {
82 /* Guess that we'll need an extra byte, to avoid a needless realloc
83 * for nul-termination */
84 return (in_len
< SIZE_MAX
) ? in_len
+ 1 : in_len
;
87 /* Always guess a factor of 2. */
91 if (in_len
< SIZE_T_CEILING
/2)
94 return MAX(in_len
, 1024);
97 /** Internal function to implement tor_compress/tor_uncompress, depending on
98 * whether <b>compress</b> is set. All arguments are as for tor_compress or
101 tor_compress_impl(int compress
,
102 char **out
, size_t *out_len
,
103 const char *in
, size_t in_len
,
104 compress_method_t method
,
105 compression_level_t compression_level
,
107 int protocol_warn_level
)
109 tor_compress_state_t
*stream
;
112 stream
= tor_compress_new(compress
, method
, compression_level
);
114 if (stream
== NULL
) {
115 log_warn(LD_GENERAL
, "NULL stream while %scompressing",
117 log_debug(LD_GENERAL
, "method: %d level: %d at len: %lu",
118 method
, compression_level
, (unsigned long)in_len
);
122 size_t in_len_orig
= in_len
;
123 size_t out_remaining
, out_alloc
;
126 out_remaining
= out_alloc
=
127 guess_compress_size(compress
, method
, compression_level
, in_len
);
128 *out
= outptr
= tor_malloc(out_remaining
);
130 const int finish
= complete_only
|| compress
;
133 switch (tor_compress_process(stream
,
134 &outptr
, &out_remaining
,
135 &in
, &in_len
, finish
)) {
136 case TOR_COMPRESS_DONE
:
137 if (in_len
== 0 || compress
) {
140 // More data is present, and we're decompressing. So we may need to
141 // reinitialize the stream if we are handling multiple concatenated
143 tor_compress_free(stream
);
144 stream
= tor_compress_new(compress
, method
, compression_level
);
145 if (stream
== NULL
) {
146 log_warn(LD_GENERAL
, "NULL stream while %scompressing",
152 case TOR_COMPRESS_OK
:
153 if (compress
|| complete_only
) {
154 log_fn(protocol_warn_level
, LD_PROTOCOL
,
155 "Unexpected %s while %scompressing",
156 complete_only
?"end of input":"result",
158 log_debug(LD_GENERAL
, "method: %d level: %d at len: %lu",
159 method
, compression_level
, (unsigned long)in_len
);
167 case TOR_COMPRESS_BUFFER_FULL
: {
168 if (!compress
&& outptr
< *out
+out_alloc
) {
169 // A buffer error in this case means that we have a problem
171 log_fn(protocol_warn_level
, LD_PROTOCOL
,
172 "Possible truncated or corrupt compressed data");
175 if (out_alloc
>= SIZE_T_CEILING
/ 2) {
176 log_warn(LD_GENERAL
, "While %scompressing data: ran out of space.",
181 tor_compress_is_compression_bomb(in_len_orig
, out_alloc
)) {
182 // This should already have been caught down in the backend logic.
184 tor_assert_nonfatal_unreached();
188 const size_t offset
= outptr
- *out
;
190 *out
= tor_realloc(*out
, out_alloc
);
191 outptr
= *out
+ offset
;
192 out_remaining
= out_alloc
- offset
;
195 case TOR_COMPRESS_ERROR
:
196 log_fn(protocol_warn_level
, LD_GENERAL
,
197 "Error while %scompressing data: bad input?",
199 goto err
; // bad data.
203 tor_assert_nonfatal_unreached();
209 *out_len
= outptr
- *out
;
210 if (compress
&& tor_compress_is_compression_bomb(*out_len
, in_len_orig
)) {
211 log_warn(LD_BUG
, "We compressed something and got an insanely high "
212 "compression factor; other Tors would think this was a "
213 "compression bomb.");
217 // NUL-terminate our output.
218 if (out_alloc
== *out_len
)
219 *out
= tor_realloc(*out
, out_alloc
+ 1);
220 (*out
)[*out_len
] = '\0';
232 tor_compress_free(stream
);
236 /** Given <b>in_len</b> bytes at <b>in</b>, compress them into a newly
237 * allocated buffer, using the method described in <b>method</b>. Store the
238 * compressed string in *<b>out</b>, and its length in *<b>out_len</b>.
239 * Return 0 on success, -1 on failure.
242 tor_compress(char **out
, size_t *out_len
,
243 const char *in
, size_t in_len
,
244 compress_method_t method
)
246 return tor_compress_impl(1, out
, out_len
, in
, in_len
, method
,
251 /** Given zero or more compressed strings of total length <b>in_len</b> bytes
252 * at <b>in</b>, uncompress them into a newly allocated buffer, using the
253 * method described in <b>method</b>. Store the uncompressed string in
254 * *<b>out</b>, and its length in *<b>out_len</b>. Return 0 on success, -1 on
257 * If any bytes are written to <b>out</b>, an extra byte NUL is always
258 * written at the end, but not counted in <b>out_len</b>. This is a
259 * safety feature to ensure that the output can be treated as a
260 * NUL-terminated string -- though of course, callers should check
263 * If <b>complete_only</b> is true, we consider a truncated input as a
264 * failure; otherwise we decompress as much as we can. Warn about truncated
265 * or corrupt inputs at <b>protocol_warn_level</b>.
268 tor_uncompress(char **out
, size_t *out_len
,
269 const char *in
, size_t in_len
,
270 compress_method_t method
,
272 int protocol_warn_level
)
274 return tor_compress_impl(0, out
, out_len
, in
, in_len
, method
,
276 complete_only
, protocol_warn_level
);
279 /** Try to tell whether the <b>in_len</b>-byte string in <b>in</b> is likely
280 * to be compressed or not. If it is, return the likeliest compression method.
281 * Otherwise, return UNKNOWN_METHOD.
284 detect_compression_method(const char *in
, size_t in_len
)
286 if (in_len
> 2 && fast_memeq(in
, "\x1f\x8b", 2)) {
288 } else if (in_len
> 2 && (in
[0] & 0x0f) == 8 &&
289 (tor_ntohs(get_uint16(in
)) % 31) == 0) {
291 } else if (in_len
> 2 &&
292 fast_memeq(in
, "\x5d\x00\x00", 3)) {
294 } else if (in_len
> 3 &&
295 fast_memeq(in
, "\x28\xb5\x2f\xfd", 4)) {
298 return UNKNOWN_METHOD
;
302 /** Return 1 if a given <b>method</b> is supported; otherwise 0. */
304 tor_compress_supports_method(compress_method_t method
)
309 return tor_zlib_method_supported();
311 return tor_lzma_method_supported();
313 return tor_zstd_method_supported();
323 * Return a bitmask of the supported compression types, where 1<<m is
324 * set in the bitmask if and only if compression with method <b>m</b> is
328 tor_compress_get_supported_method_bitmask(void)
330 static unsigned supported
= 0;
331 if (supported
== 0) {
333 for (m
= NO_METHOD
; m
<= UNKNOWN_METHOD
; ++m
) {
334 if (tor_compress_supports_method(m
)) {
335 supported
|= (1u << m
);
342 /** Table of compression method names. These should have an "x-" prefix,
343 * if they are not listed in the IANA content coding registry. */
344 static const struct {
346 compress_method_t method
;
347 } compression_method_names
[] = {
348 { "gzip", GZIP_METHOD
},
349 { "deflate", ZLIB_METHOD
},
350 // We call this "x-tor-lzma" rather than "x-lzma", because we impose a
351 // lower maximum memory usage on the decoding side.
352 { "x-tor-lzma", LZMA_METHOD
},
353 { "x-zstd" , ZSTD_METHOD
},
354 { "identity", NO_METHOD
},
356 /* Later entries in this table are not canonical; these are recognized but
358 { "x-gzip", GZIP_METHOD
},
361 /** Return the canonical string representation of the compression method
362 * <b>method</b>, or NULL if the method isn't recognized. */
364 compression_method_get_name(compress_method_t method
)
367 for (i
= 0; i
< ARRAY_LENGTH(compression_method_names
); ++i
) {
368 if (method
== compression_method_names
[i
].method
)
369 return compression_method_names
[i
].name
;
374 /** Table of compression human readable method names. */
375 static const struct {
376 compress_method_t method
;
378 } compression_method_human_names
[] = {
379 { NO_METHOD
, "uncompressed" },
380 { GZIP_METHOD
, "gzipped" },
381 { ZLIB_METHOD
, "deflated" },
382 { LZMA_METHOD
, "LZMA compressed" },
383 { ZSTD_METHOD
, "Zstandard compressed" },
384 { UNKNOWN_METHOD
, "unknown encoding" },
387 /** Return a human readable string representation of the compression method
388 * <b>method</b>, or NULL if the method isn't recognized. */
390 compression_method_get_human_name(compress_method_t method
)
393 for (i
= 0; i
< ARRAY_LENGTH(compression_method_human_names
); ++i
) {
394 if (method
== compression_method_human_names
[i
].method
)
395 return compression_method_human_names
[i
].name
;
400 /** Return the compression method represented by the string <b>name</b>, or
401 * UNKNOWN_METHOD if the string isn't recognized. */
403 compression_method_get_by_name(const char *name
)
406 for (i
= 0; i
< ARRAY_LENGTH(compression_method_names
); ++i
) {
407 if (!strcmp(compression_method_names
[i
].name
, name
))
408 return compression_method_names
[i
].method
;
410 return UNKNOWN_METHOD
;
413 /** Return a string representation of the version of the library providing the
414 * compression method given in <b>method</b>. Returns NULL if <b>method</b> is
415 * unknown or unsupported. */
417 tor_compress_version_str(compress_method_t method
)
422 return tor_zlib_get_version_str();
424 return tor_lzma_get_version_str();
426 return tor_zstd_get_version_str();
434 /** Return a string representation of the version of the library, found at
435 * compile time, providing the compression method given in <b>method</b>.
436 * Returns NULL if <b>method</b> is unknown or unsupported. */
438 tor_compress_header_version_str(compress_method_t method
)
443 return tor_zlib_get_header_version_str();
445 return tor_lzma_get_header_version_str();
447 return tor_zstd_get_header_version_str();
455 /** Return the approximate number of bytes allocated for all
456 * supported compression schemas. */
458 tor_compress_get_total_allocation(void)
460 return atomic_counter_get(&total_compress_allocation
) +
461 tor_zlib_get_total_allocation() +
462 tor_lzma_get_total_allocation() +
463 tor_zstd_get_total_allocation();
466 /** Internal state for an incremental compression/decompression. The body of
467 * this struct is not exposed. */
468 struct tor_compress_state_t
{
469 compress_method_t method
; /**< The compression method. */
472 tor_zlib_compress_state_t
*zlib_state
;
473 tor_lzma_compress_state_t
*lzma_state
;
474 tor_zstd_compress_state_t
*zstd_state
;
475 } u
; /**< Compression backend state. */
478 /** Construct and return a tor_compress_state_t object using <b>method</b>. If
479 * <b>compress</b>, it's for compression; otherwise it's for decompression. */
480 tor_compress_state_t
*
481 tor_compress_new(int compress
, compress_method_t method
,
482 compression_level_t compression_level
)
484 tor_compress_state_t
*state
;
486 state
= tor_malloc_zero(sizeof(tor_compress_state_t
));
487 state
->method
= method
;
492 tor_zlib_compress_state_t
*zlib_state
=
493 tor_zlib_compress_new(compress
, method
, compression_level
);
495 if (zlib_state
== NULL
)
498 state
->u
.zlib_state
= zlib_state
;
502 tor_lzma_compress_state_t
*lzma_state
=
503 tor_lzma_compress_new(compress
, method
, compression_level
);
505 if (lzma_state
== NULL
)
508 state
->u
.lzma_state
= lzma_state
;
512 tor_zstd_compress_state_t
*zstd_state
=
513 tor_zstd_compress_new(compress
, method
, compression_level
);
515 if (zstd_state
== NULL
)
518 state
->u
.zstd_state
= zstd_state
;
528 atomic_counter_add(&total_compress_allocation
,
529 sizeof(tor_compress_state_t
));
537 /** Compress/decompress some bytes using <b>state</b>. Read up to
538 * *<b>in_len</b> bytes from *<b>in</b>, and write up to *<b>out_len</b> bytes
539 * to *<b>out</b>, adjusting the values as we go. If <b>finish</b> is true,
540 * we've reached the end of the input.
542 * Return TOR_COMPRESS_DONE if we've finished the entire
543 * compression/decompression.
544 * Return TOR_COMPRESS_OK if we're processed everything from the input.
545 * Return TOR_COMPRESS_BUFFER_FULL if we're out of space on <b>out</b>.
546 * Return TOR_COMPRESS_ERROR if the stream is corrupt.
548 tor_compress_output_t
549 tor_compress_process(tor_compress_state_t
*state
,
550 char **out
, size_t *out_len
,
551 const char **in
, size_t *in_len
,
554 tor_assert(state
!= NULL
);
555 const size_t in_len_orig
= *in_len
;
556 const size_t out_len_orig
= *out_len
;
557 tor_compress_output_t rv
;
559 if (*out_len
== 0 && (*in_len
> 0 || finish
)) {
560 // If we still have input data, but no space for output data, we might as
561 // well return early and let the caller do the reallocation of the out
563 return TOR_COMPRESS_BUFFER_FULL
;
566 switch (state
->method
) {
569 rv
= tor_zlib_compress_process(state
->u
.zlib_state
,
570 out
, out_len
, in
, in_len
,
574 rv
= tor_lzma_compress_process(state
->u
.lzma_state
,
575 out
, out_len
, in
, in_len
,
579 rv
= tor_zstd_compress_process(state
->u
.zstd_state
,
580 out
, out_len
, in
, in_len
,
584 rv
= tor_cnone_compress_process(out
, out_len
, in
, in_len
,
591 if (BUG((rv
== TOR_COMPRESS_OK
) &&
592 *in_len
== in_len_orig
&&
593 *out_len
== out_len_orig
)) {
595 "More info on the bug: method == %s, finish == %d, "
596 " *in_len == in_len_orig == %lu, "
597 "*out_len == out_len_orig == %lu",
598 compression_method_get_human_name(state
->method
), finish
,
599 (unsigned long)in_len_orig
, (unsigned long)out_len_orig
);
600 return TOR_COMPRESS_ERROR
;
605 return TOR_COMPRESS_ERROR
;
608 /** Deallocate <b>state</b>. */
610 tor_compress_free_(tor_compress_state_t
*state
)
615 switch (state
->method
) {
618 tor_zlib_compress_free(state
->u
.zlib_state
);
621 tor_lzma_compress_free(state
->u
.lzma_state
);
624 tor_zstd_compress_free(state
->u
.zstd_state
);
632 atomic_counter_sub(&total_compress_allocation
,
633 sizeof(tor_compress_state_t
));
637 /** Return the approximate number of bytes allocated for <b>state</b>. */
639 tor_compress_state_size(const tor_compress_state_t
*state
)
641 tor_assert(state
!= NULL
);
643 size_t size
= sizeof(tor_compress_state_t
);
645 switch (state
->method
) {
648 size
+= tor_zlib_compress_state_size(state
->u
.zlib_state
);
651 size
+= tor_lzma_compress_state_size(state
->u
.lzma_state
);
654 size
+= tor_zstd_compress_state_size(state
->u
.zstd_state
);
664 /** Initialize all compression modules. */
666 tor_compress_init(void)
668 atomic_counter_init(&total_compress_allocation
);
677 /** Warn if we had any problems while setting up our compression libraries.
679 * (This isn't part of tor_compress_init, since the logs aren't set up yet.)
682 tor_compress_log_init_warnings(void)
684 // XXXX can we move this into tor_compress_init() after all? log.c queues
685 // XXXX log messages at startup.
686 tor_zstd_warn_if_version_mismatched();
690 subsys_compress_initialize(void)
692 return tor_compress_init();
695 const subsys_fns_t sys_compress
= {
697 SUBSYS_DECLARE_LOCATION(),
700 .initialize
= subsys_compress_initialize
,