1 /* gznorm.c -- normalize a gzip stream
2 * Copyright (C) 2018 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 * Version 1.0 7 Oct 2018 Mark Adler */
6 // gznorm takes a gzip stream, potentially containing multiple members, and
7 // converts it to a gzip stream with a single member. In addition the gzip
8 // header is normalized, removing the file name and time stamp, and setting the
9 // other header contents (XFL, OS) to fixed values. gznorm does not recompress
10 // the data, so it is fast, but no advantage is gained from the history that
11 // could be available across member boundaries.
13 #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf,
14 // vsnprintf, stdout, stderr, NULL, FILE
15 #include <stdlib.h> // malloc, free
16 #include <string.h> // strerror
17 #include <errno.h> // errno
18 #include <stdarg.h> // va_list, va_start, va_end
19 #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd,
20 // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
21 // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR, Z_MEM_ERROR
24 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
27 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
29 # define SET_BINARY_MODE(file)
34 // printf to an allocated string. Return the string, or NULL if the printf or
// the allocation fails.
36 local
char *aprintf(char *fmt
, ...) {
37 // Get the length of the result of the printf.
// With a NULL buffer and a size of 0, vsnprintf writes nothing and only
// reports how many characters the formatted output would need.
40 int len
= vsnprintf(NULL
, 0, fmt
, args
);
45 // Allocate the required space and printf to it.
// The extra byte in len + 1 holds the terminating NUL.
46 char *str
= malloc(len
+ 1);
50 vsnprintf(str
, len
+ 1, fmt
, args
);
55 // Return with an error, putting an allocated error message in *err. Doing an
56 // inflateEnd() on an already ended state, or one with state set to Z_NULL, is
// permitted.
61 *err = aprintf(__VA_ARGS__); \
65 // Chunk size for buffered reads and for decompression. Twice this many bytes
66 // will be allocated on the stack by gzip_normalize(). Must fit in an unsigned.
69 // Read a gzip stream from in and write an equivalent normalized gzip stream to
70 // out. If given no input, an empty gzip stream will be written. If successful,
71 // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
72 // details of the error are returned in *err, a pointer to an allocated string.
74 // The input may be a stream with multiple gzip members, which is converted to
75 // a single gzip member on the output. Each gzip member is decompressed at the
76 // level of deflate blocks. This enables clearing the last-block bit, shifting
77 // the compressed data to concatenate to the previous member's compressed data,
78 // which can end at an arbitrary bit boundary, and identifying stored blocks in
79 // order to resynchronize those to byte boundaries. The deflate compressed data
80 // is terminated with a 10-bit empty fixed block. If any members on the input
81 // end with a 10-bit empty fixed block, then that block is excised from the
82 // stream. This avoids appending empty fixed blocks for every normalization,
83 // and assures that gzip_normalize applied a second time will not change the
84 // input. The pad bits after stored block headers and after the final deflate
85 // block are all forced to zeros.
//
// Parameters:
//   in  - stream the gzip input is read from
//   out - stream the normalized gzip output is written to
//   err - receives NULL on success, or an allocated message string on error
86 local
int gzip_normalize(FILE *in
, FILE *out
, char **err
) {
87 // initialize the inflate engine to process a gzip member
93 strm
.next_in
= Z_NULL
;
// A windowBits value of 15 + 16 selects a 32K window with gzip decoding.
94 if (inflateInit2(&strm
, 15 + 16) != Z_OK
)
97 // State while processing the input gzip stream.
98 enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
99 BETWEEN
, // between gzip members (must end in this state)
100 HEAD
, // reading a gzip header
101 BLOCK
, // reading deflate blocks
102 TAIL
// reading a gzip trailer
103 } state
= BETWEEN
; // current component being processed
104 unsigned long crc
= 0; // accumulated CRC of uncompressed data
105 unsigned long len
= 0; // accumulated length of uncompressed data
106 unsigned long buf
= 0; // deflate stream bit buffer of num bits
107 int num
= 0; // number of bits in buf (at bottom)
109 // Write a canonical gzip header (no mod time, file name, comment, extra
110 // block, or extra flags, and OS is marked as unknown).
// The ten bytes are: magic 1f 8b, method 8 (deflate), flags 0, four
// mod-time bytes of 0, extra flags 0, and OS 0xff.
111 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out
);
113 // Process the gzip stream from in until reaching the end of the input,
114 // encountering invalid input, or experiencing an i/o error.
115 int more
; // true if not at the end of the input
117 // State inside this loop.
118 unsigned char *put
; // next input buffer location to process
119 int prev
; // number of bits from previous block in
120 // the bit buffer, or -1 if not at the start of a block
122 unsigned long long memb
; // uncompressed length of member
123 size_t tail
; // number of trailer bytes read (0..8)
124 unsigned long part
; // accumulated trailer component
126 // Get the next chunk of input from in.
127 unsigned char dat
[CHUNK
];
128 strm
.avail_in
= fread(dat
, 1, CHUNK
, in
);
129 if (strm
.avail_in
== 0)
131 more
= strm
.avail_in
== CHUNK
;
132 strm
.next_in
= put
= dat
;
134 // Run that chunk of input through the inflate engine to exhaustion.
136 // At this point it is assured that strm.avail_in > 0.
138 // Inflate until the end of a gzip component (header, deflate
139 // block, trailer) is reached, or until all of the chunk is
140 // consumed. The resulting decompressed data is discarded, though
141 // the total size of the decompressed data in each member is
142 // tracked, for the calculation of the total CRC.
144 // inflate and handle any errors
145 unsigned char scrap
[CHUNK
];
146 strm
.avail_out
= CHUNK
;
147 strm
.next_out
= scrap
;
// Z_BLOCK makes inflate() return at deflate block boundaries and at the
// end of the gzip header, which is what lets each component be handled
// separately below.
148 int ret
= inflate(&strm
, Z_BLOCK
);
149 if (ret
== Z_MEM_ERROR
)
150 BYE("out of memory");
151 if (ret
== Z_DATA_ERROR
)
152 BYE("input invalid: %s", strm
.msg
);
// Z_OK, Z_BUF_ERROR, and Z_STREAM_END are the only results accepted
// as non-fatal here.
153 if (ret
!= Z_OK
&& ret
!= Z_BUF_ERROR
&& ret
!= Z_STREAM_END
)
154 BYE("internal error");
156 // Update the number of uncompressed bytes generated in this
157 // member. The actual count (not modulo 2^32) is required to
158 // correctly compute the total CRC.
159 unsigned got
= CHUNK
- strm
.avail_out
;
162 BYE("overflow error");
164 // Continue to process this chunk until it is consumed, or
165 // until the end of a component (header, deflate block, or
166 // trailer) is reached.
// A full scrap buffer (avail_out == 0) means inflate may have more to
// produce; bit 7 of data_type being set marks the end of a component.
167 } while (strm
.avail_out
== 0 && (strm
.data_type
& 0x80) == 0);
169 // Since strm.avail_in was > 0 for the inflate call, some input was
170 // just consumed. It is therefore assured that put < strm.next_in.
172 // Disposition the consumed component or part of a component.
176 // Fall through to HEAD when some or all of the header is
// processed.
180 // Discard the header.
181 if (strm
.data_type
& 0x80) {
182 // End of header reached -- deflate blocks follow.
191 // Copy the deflate stream to the output, but with the
192 // last-block-bit cleared. Re-synchronize stored block
193 // headers to the output byte boundaries. The bytes at
194 // put..strm.next_in-1 are the compressed data that has been
195 // processed and is ready to be copied to the output.
197 // At this point, it is assured that new compressed data is
198 // available, i.e., put < strm.next_in. If prev is -1, then
199 // that compressed data starts in the middle of a deflate
200 // block. If prev is not -1, then the bits in the bit
201 // buffer, possibly combined with the bits in *put, contain
202 // the three-bit header of the new deflate block. In that
203 // case, prev is the number of bits from the previous block
204 // that remain in the bit buffer. Since num is the number
205 // of bits in the bit buffer, we have that num - prev is
206 // the number of bits from the new block currently in the
// bit buffer.
209 // If strm.data_type & 0xc0 is 0x80, then the last byte of
210 // the available compressed data includes the last bits of
211 // the end of a deflate block. In that case, that last byte
212 // also has strm.data_type & 0x1f bits of the next deflate
213 // block, in the range 0..7. If strm.data_type & 0xc0 is
214 // 0xc0, then the last byte of the compressed data is the
215 // end of the deflate stream, followed by strm.data_type &
216 // 0x1f pad bits, also in the range 0..7.
218 // Set bits to the number of bits not yet consumed from the
219 // last byte. If we are at the end of the block, bits is
220 // either the number of bits in the last byte belonging to
221 // the next block, or the number of pad bits after the
222 // final block. In either of those cases, bits is in the
// range 0..7.
224 ; // (required due to C syntax oddity)
225 int bits
= strm
.data_type
& 0x1f;
228 // We are at the start of a new block. Clear the last
229 // block bit, and check for special cases. If it is a
230 // stored block, then emit the header and pad to the
231 // next byte boundary. If it is a final, empty fixed
232 // block, then excise it.
234 // Some or all of the three header bits for this block
235 // may already be in the bit buffer. Load any remaining
236 // header bits into the bit buffer.
237 if (num
- prev
< 3) {
238 buf
+= (unsigned long)*put
++ << num
;
242 // Set last to have a 1 in the position of the last
243 // block bit in the bit buffer.
244 unsigned long last
= (unsigned long)1 << prev
;
// Header value 3 is last-block bit 1 with block type 01 (fixed codes).
246 if (((buf
>> prev
) & 7) == 3) {
247 // This is a final fixed block. Load at least ten
248 // bits from this block, including the header, into
249 // the bit buffer. We already have at least three,
250 // so at most one more byte needs to be loaded.
251 if (num
- prev
< 10) {
252 if (put
== strm
.next_in
)
253 // Need to go get and process more input.
254 // We'll end up back here to finish this.
256 buf
+= (unsigned long)*put
++ << num
;
// Ten bits equal to 3 are the three header bits followed by seven
// zero bits, i.e. nothing but the end-of-block code.
259 if (((buf
>> prev
) & 0x3ff) == 3) {
260 // That final fixed block is empty. Delete it
261 // to avoid adding an empty block every time a
262 // gzip stream is normalized.
264 buf
&= last
- 1; // zero the pad bits
// Both block-type bits zero means a stored block.
267 else if (((buf
>> prev
) & 6) == 0) {
268 // This is a stored block. Flush to the next
269 // byte boundary after the three-bit header.
// (prev + 10) & ~7 rounds prev + 3 up to a multiple of eight.
270 num
= (prev
+ 10) & ~7;
271 buf
&= last
- 1; // zero the pad bits
274 // Clear the last block bit.
277 // Write out complete bytes in the bit buffer.
284 // If no more bytes left to process, then we have
285 // consumed the byte that had bits from the next block.
286 if (put
== strm
.next_in
)
290 // We are done handling the deflate block header. Now copy
291 // all or almost all of the remaining compressed data that
292 // has been processed so far. Don't copy one byte at the
293 // end if it contains bits from the next deflate block or
294 // pad bits at the end of a deflate block.
296 // mix is 1 if we are at the end of a deflate block, and if
297 // some of the bits in the last byte follow this block. mix
298 // is 0 if we are in the middle of a deflate block, if the
299 // deflate block ended on a byte boundary, or if all of the
300 // compressed data processed so far has been consumed.
301 int mix
= (strm
.data_type
& 0x80) && bits
;
303 // Copy all of the processed compressed data to the output,
304 // except for the last byte if it contains bits from the
305 // next deflate block or pad bits at the end of the deflate
306 // stream. Copy the data after shifting in num bits from
307 // buf in front of it, leaving num bits from the end of the
308 // compressed data in buf when done.
309 unsigned char *end
= strm
.next_in
- mix
;
312 // Insert num bits from buf before the data being
// copied.
315 buf
+= (unsigned)(*put
++) << num
;
320 // No shifting needed -- write directly.
321 fwrite(put
, 1, end
- put
, out
);
326 // Process the last processed byte if it wasn't written.
328 // Load the last byte into the bit buffer.
329 buf
+= (unsigned)(*put
++) << num
;
// Bit 6 of data_type is set only at the end of the deflate stream.
332 if (strm
.data_type
& 0x40) {
333 // We are at the end of the deflate stream and
334 // the last byte ends with "bits" pad bits. Discard the pad
335 // bits and write a byte to the output, if available.
336 // Leave the num bits left over in buf to prepend
337 // to the next deflate stream.
345 // Force the pad bits in the bit buffer to zeros.
346 buf
&= ((unsigned long)1 << num
) - 1;
348 // Don't need to set prev here since going to TAIL.
351 // At the end of an internal deflate block. Leave
352 // the last byte in the bit buffer to examine on
353 // the next entry to BLOCK, when more bits from the
354 // next block will be available.
355 prev
= num
- bits
; // number of bits in buffer
356 // from current block
359 // Don't have a byte left over, so we are in the middle of
360 // a deflate block, or the deflate block ended on a byte
361 // boundary. Set prev appropriately for the next entry into
// BLOCK.
363 else if (strm
.data_type
& 0x80)
364 // The block ended on a byte boundary, so no header
365 // bits are in the bit buffer.
368 // In the middle of a deflate block, so no header here.
371 // Check for the end of the deflate stream.
372 if ((strm
.data_type
& 0xc0) == 0xc0) {
373 // That ends the deflate stream on the input side, the
374 // pad bits were discarded, and any remaining bits from
375 // the last block in the stream are saved in the bit
376 // buffer to prepend to the next stream. Process the
377 // gzip trailer next.
385 // Accumulate available trailer bytes to update the total
386 // CRC and the total uncompressed length.
// Each new byte enters at bits 24..31 while earlier bytes move down by
// eight, so four consecutive bytes assemble a little-endian 32-bit value.
388 part
= (part
>> 8) + ((unsigned long)(*put
++) << 24);
391 // Update the total CRC.
// NOTE(review): the declaration of len2 is not visible here; it looks
// like a signed copy of memb, checked to confirm the member length
// survived the conversion -- confirm against the full source.
393 if (len2
< 0 || (unsigned long long)len2
!= memb
)
394 BYE("overflow error");
// While crc is still zero the member CRC in part is taken as is;
// otherwise it is combined with the CRC accumulated so far.
395 crc
= crc
? crc32_combine(crc
, part
, len2
) : part
;
398 else if (tail
== 8) {
399 // Update the total uncompressed length. (It's ok
400 // if this sum is done modulo 2^32.)
403 // At the end of a member. Set up to inflate an
404 // immediately following gzip member. (If we made
405 // it this far, then the trailer was valid.)
406 if (inflateReset(&strm
) != Z_OK
)
407 BYE("internal error");
411 } while (put
< strm
.next_in
);
415 // Process the input buffer until completely consumed.
416 } while (strm
.avail_in
> 0);
418 // Process input until end of file, invalid input, or i/o error.
421 // Done with the inflate engine.
424 // Verify the validity of the input.
425 if (state
!= BETWEEN
)
426 BYE("input invalid: incomplete gzip stream");
428 // Write the remaining deflate stream bits, followed by a terminating
429 // deflate fixed block.
// The value 3 placed above the num pending bits is the header of a final
// fixed block (last-block bit 1, block type 01).
430 buf
+= (unsigned long)3 << num
;
436 // Write the gzip trailer, which is the CRC and the uncompressed length
437 // modulo 2^32, both in little-endian order.
440 putc(crc
>> 16, out
);
441 putc(crc
>> 24, out
);
444 putc(len
>> 16, out
);
445 putc(len
>> 24, out
);
448 // Check for any i/o errors.
449 if (ferror(in
) || ferror(out
))
450 BYE("i/o error: %s", strerror(errno
));
457 // Normalize the gzip stream on stdin, writing the result to stdout.
459 // Avoid end-of-line conversions on evil operating systems.
460 SET_BINARY_MODE(stdin
);
461 SET_BINARY_MODE(stdout
);
463 // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
465 int ret
= gzip_normalize(stdin
, stdout
, &err
);
467 fprintf(stderr
, "gznorm error: %s\n", err
);