filters/xz/xzfile.c

   1 /* nbdkit
   2  * Copyright Red Hat
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions are
   6  * met:
   7  *
   8  * * Redistributions of source code must retain the above copyright
   9  * notice, this list of conditions and the following disclaimer.
  10  *
  11  * * Redistributions in binary form must reproduce the above copyright
  12  * notice, this list of conditions and the following disclaimer in the
  13  * documentation and/or other materials provided with the distribution.
  14  *
  15  * * Neither the name of Red Hat nor the names of its contributors may be
  16  * used to endorse or promote products derived from this software without
  17  * specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  22  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
  23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  26  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  27  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  28  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  29  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  */
  32
  33 /* liblzma is a complex interface, so abstract it here. */
  34
  35 #include <config.h>
  36
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include <stdbool.h>
  40 #include <string.h>
  41 #include <stdint.h>
  42 #include <inttypes.h>
  43 #include <unistd.h>
  44 #include <fcntl.h>
  45 #include <sys/types.h>
  46
  47 #include <nbdkit-filter.h>
  48
  49 #include <lzma.h>
  50
  51 #include "cleanup.h"
  52
  53 #include "xzfile.h"
  54
  55 #define XZ_HEADER_MAGIC     "\xfd" "7zXZ\0"
  56 #define XZ_HEADER_MAGIC_LEN 6
  57 #define XZ_FOOTER_MAGIC     "YZ"
  58 #define XZ_FOOTER_MAGIC_LEN 2
  59
  60 struct xzfile {
  61   lzma_index *idx;
  62   size_t nr_streams;
  63   size_t nr_blocks;
  64   uint64_t max_uncompressed_block_size;
  65 };
  66
  67 static bool check_header_magic (nbdkit_next *next);
  68 static lzma_index *parse_indexes (nbdkit_next *next, size_t *);
  69 static int iter_indexes (lzma_index *idx, size_t *, uint64_t *);
  70
  71 xzfile *
  72 xzfile_open (nbdkit_next *next)
  73 {
  74   xzfile *xz;
  75   uint64_t size;
  76
  77   xz = malloc (sizeof *xz);
  78   if (xz == NULL) {
  79     nbdkit_error ("malloc: %m");
  80     return NULL;
  81   }
  82
  83   /* Check file magic. */
  84   if (!check_header_magic (next)) {
  85     nbdkit_error ("xz: not an xz file");
  86     goto err1;
  87   }
  88
  89   /* Read and parse the indexes. */
  90   xz->idx = parse_indexes (next, &xz->nr_streams);
  91   if (xz->idx == NULL)
  92     goto err1;
  93
  94   /* Iterate over indexes to find the number of and largest block. */
  95   if (iter_indexes (xz->idx,
  96                     &xz->nr_blocks, &xz->max_uncompressed_block_size) == -1)
  97     goto err1;
  98
  99   size = lzma_index_uncompressed_size (xz->idx);
 100   nbdkit_debug ("xz: size %" PRIu64 " bytes (%.1fM)",
 101                 size, size / 1024.0 / 1024.0);
 102   nbdkit_debug ("xz: %zu streams, %zu blocks", xz->nr_streams, xz->nr_blocks);
 103   nbdkit_debug ("xz: maximum uncompressed block size %" PRIu64 " bytes (%.1fM)",
 104                 xz->max_uncompressed_block_size,
 105                 xz->max_uncompressed_block_size / 1024.0 / 1024.0);
 106
 107   return xz;
 108
 109  err1:
 110   free (xz);
 111   return NULL;
 112 }
 113
 114 static bool
 115 check_header_magic (nbdkit_next *next)
 116 {
 117   char buf[XZ_HEADER_MAGIC_LEN];
 118   int err;
 119
 120   if (next->get_size (next) < XZ_HEADER_MAGIC_LEN) {
 121     nbdkit_error ("xz: file too short");
 122     return false;
 123   }
 124   if (next->pread (next, buf, XZ_HEADER_MAGIC_LEN, 0, 0, &err) == -1) {
 125     nbdkit_error ("xz: could not read header magic: error %d", err);
 126     return false;
 127   }
 128   if (memcmp (buf, XZ_HEADER_MAGIC, XZ_HEADER_MAGIC_LEN) != 0)
 129     return false;
 130   return true;
 131 }
 132
 133 /* For explanation of this function, see src/xz/list.c:parse_indexes
 134  * in the xz sources.
 135  */
 136 static lzma_index *
 137 parse_indexes (nbdkit_next *next,
 138                size_t *nr_streams)
 139 {
 140   lzma_ret r;
 141   int64_t size, pos, index_size, offs;
 142   int err;
 143   uint8_t footer[LZMA_STREAM_HEADER_SIZE];
 144   uint8_t header[LZMA_STREAM_HEADER_SIZE];
 145   lzma_stream_flags footer_flags;
 146   lzma_stream_flags header_flags;
 147   lzma_stream strm = LZMA_STREAM_INIT;
 148   lzma_index *combined_index = NULL;
 149   lzma_index *this_index = NULL;
 150   lzma_vli stream_padding = 0;
 151
 152   *nr_streams = 0;
 153
 154   /* Check file size is a multiple of 4 bytes. */
 155   pos = size = next->get_size (next);
 156   if (pos == -1) {
 157     nbdkit_error ("xz: get_size: %m");
 158     goto err;
 159   }
 160   if ((pos & 3) != 0) {
 161     nbdkit_error ("xz: not an xz file: size is not a multiple of 4 bytes");
 162     goto err;
 163   }
 164
 165   /* Jump backwards through the file identifying each stream. */
 166   while (pos > 0) {
 167     nbdkit_debug ("looping through streams: pos = %" PRIi64, pos);
 168
 169     if (pos < LZMA_STREAM_HEADER_SIZE) {
 170       nbdkit_error ("xz: corrupted file at %" PRIi64, pos);
 171       goto err;
 172     }
 173
 174     if (next->pread (next, footer, LZMA_STREAM_HEADER_SIZE,
 175                      pos - LZMA_STREAM_HEADER_SIZE, 0, &err) == -1) {
 176       nbdkit_error ("xz: read stream footer: error %d", err);
 177       goto err;
 178     }
 179     /* Skip stream padding. */
 180     if (footer[8] == 0 && footer[9] == 0 &&
 181         footer[10] == 0 && footer[11] == 0) {
 182       stream_padding += 4;
 183       pos -= 4;
 184       continue;
 185     }
 186
 187     pos -= LZMA_STREAM_HEADER_SIZE;
 188     (*nr_streams)++;
 189
 190     nbdkit_debug ("decode stream footer at pos = %" PRIi64, pos);
 191
 192     /* Does the stream footer look reasonable? */
 193     r = lzma_stream_footer_decode (&footer_flags, footer);
 194     if (r != LZMA_OK) {
 195       nbdkit_error ("xz: invalid stream footer (error %d)", r);
 196       goto err;
 197     }
 198     nbdkit_debug ("backward_size = %" PRIu64,
 199                   (uint64_t) footer_flags.backward_size);
 200     index_size = footer_flags.backward_size;
 201     if (pos < index_size + LZMA_STREAM_HEADER_SIZE) {
 202       nbdkit_error ("xz: invalid stream footer");
 203       goto err;
 204     }
 205
 206     pos -= index_size;
 207     nbdkit_debug ("decode index at pos = %" PRIi64, pos);
 208
 209     /* Decode the index. */
 210     r = lzma_index_decoder (&strm, &this_index, UINT64_MAX);
 211     if (r != LZMA_OK) {
 212       nbdkit_error ("xz: invalid stream index (error %d)", r);
 213       goto err;
 214     }
 215
 216     offs = pos;
 217     do {
 218       uint8_t buf[BUFSIZ];
 219
 220       strm.avail_in = index_size;
 221       if (strm.avail_in > BUFSIZ)
 222         strm.avail_in = BUFSIZ;
 223       if (pos + strm.avail_in > size)
 224         strm.avail_in = size - pos;
 225
 226       if (next->pread (next, buf, strm.avail_in, offs, 0, &err) == -1) {
 227         nbdkit_error ("xz: read index: error %d", err);
 228         goto err;
 229       }
 230       offs += strm.avail_in;
 231       index_size -= strm.avail_in;
 232
 233       strm.next_in = buf;
 234       r = lzma_code (&strm, LZMA_RUN);
 235     } while (r == LZMA_OK);
 236
 237     if (r != LZMA_STREAM_END) {
 238       nbdkit_error ("xz: could not parse index (error %d)", r);
 239       goto err;
 240     }
 241
 242     pos -= lzma_index_total_size (this_index) + LZMA_STREAM_HEADER_SIZE;
 243
 244     nbdkit_debug ("decode stream header at pos = %" PRIi64, pos);
 245
 246     /* Read and decode the stream header. */
 247     if (next->pread (next, header, LZMA_STREAM_HEADER_SIZE, pos, 0,
 248                      &err) == -1) {
 249       nbdkit_error ("xz: read stream header: error %d", err);
 250       goto err;
 251     }
 252
 253     r = lzma_stream_header_decode (&header_flags, header);
 254     if (r != LZMA_OK) {
 255       nbdkit_error ("xz: invalid stream header (error %d)", r);
 256       goto err;
 257     }
 258
 259     /* Header and footer of the stream should be equal. */
 260     r = lzma_stream_flags_compare (&header_flags, &footer_flags);
 261     if (r != LZMA_OK) {
 262       nbdkit_error ("xz: header and footer of stream are not equal (error %d)",
 263                     r);
 264       goto err;
 265     }
 266
 267     /* Store the decoded stream flags in this_index. */
 268     r = lzma_index_stream_flags (this_index, &footer_flags);
 269     if (r != LZMA_OK) {
 270       nbdkit_error ("xz: cannot read stream_flags from index (error %d)", r);
 271       goto err;
 272     }
 273
 274     /* Store the amount of stream padding so far.  Needed to calculate
 275      * compressed offsets correctly in multi-stream files.
 276      */
 277     r = lzma_index_stream_padding (this_index, stream_padding);
 278     if (r != LZMA_OK) {
 279       nbdkit_error ("xz: cannot set stream_padding in index (error %d)", r);
 280       goto err;
 281     }
 282
 283     if (combined_index != NULL) {
 284       r = lzma_index_cat (this_index, combined_index, NULL);
 285       if (r != LZMA_OK) {
 286         nbdkit_error ("xz: cannot combine indexes");
 287         goto err;
 288       }
 289     }
 290
 291     combined_index = this_index;
 292     this_index = NULL;
 293   }
 294
 295   lzma_end (&strm);
 296
 297   return combined_index;
 298
 299  err:
 300   lzma_end (&strm);
 301   lzma_index_end (this_index, NULL);
 302   lzma_index_end (combined_index, NULL);
 303   return NULL;
 304 }
 305
 306 /* Iterate over the indexes to find the number of blocks and
 307  * the largest block.
 308  */
 309 static int
 310 iter_indexes (lzma_index *idx,
 311               size_t *nr_blocks, uint64_t *max_uncompressed_block_size)
 312 {
 313   lzma_index_iter iter;
 314
 315   *nr_blocks = 0;
 316   *max_uncompressed_block_size = 0;
 317
 318   lzma_index_iter_init (&iter, idx);
 319   while (!lzma_index_iter_next (&iter, LZMA_INDEX_ITER_NONEMPTY_BLOCK)) {
 320     if (iter.block.uncompressed_size > *max_uncompressed_block_size)
 321       *max_uncompressed_block_size = iter.block.uncompressed_size;
 322     (*nr_blocks)++;
 323   }
 324
 325   return 0;
 326 }
 327
 328 void
 329 xzfile_close (xzfile *xz)
 330 {
 331   if (xz) {
 332     lzma_index_end (xz->idx, NULL);
 333     free (xz);
 334   }
 335 }
 336
 337 uint64_t
 338 xzfile_max_uncompressed_block_size (xzfile *xz)
 339 {
 340   return xz->max_uncompressed_block_size;
 341 }
 342
 343 uint64_t
 344 xzfile_get_size (xzfile *xz)
 345 {
 346   return lzma_index_uncompressed_size (xz->idx);
 347 }
 348
 349 char *
 350 xzfile_read_block (xzfile *xz,
 351                    nbdkit_next *next,
 352                    uint32_t flags, int *err,
 353                    uint64_t offset,
 354                    uint64_t *start_rtn, uint64_t *size_rtn)
 355 {
 356   int64_t offs, size;
 357   lzma_index_iter iter;
 358   uint8_t header[LZMA_BLOCK_HEADER_SIZE_MAX];
 359   lzma_block block;
 360   lzma_filter filters[LZMA_FILTERS_MAX + 1];
 361   lzma_ret r;
 362   lzma_stream strm = LZMA_STREAM_INIT;
 363   const size_t bufsize = 1024 * 1024;
 364   CLEANUP_FREE unsigned char *buf = NULL;
 365   char *data = NULL;
 366   size_t i;
 367
 368   /* Read the total size of the underlying disk, so we don't
 369    * read over the end.
 370    */
 371   size = next->get_size (next);
 372   if (size == -1) {
 373     nbdkit_error ("xz: get_size: %m");
 374     return NULL;
 375   }
 376
 377   /* Locate the block containing the uncompressed offset. */
 378   lzma_index_iter_init (&iter, xz->idx);
 379   if (lzma_index_iter_locate (&iter, offset)) {
 380     nbdkit_error ("cannot find offset %" PRIu64 " in the xz file", offset);
 381     return NULL;
 382   }
 383
 384   *start_rtn = iter.block.uncompressed_file_offset;
 385   *size_rtn = iter.block.uncompressed_size;
 386
 387   nbdkit_debug ("seek: block number %d at file offset %" PRIu64,
 388                 (int) iter.block.number_in_file,
 389                 (uint64_t) iter.block.compressed_file_offset);
 390
 391   /* Read the block header.  Start by reading a single byte which
 392    * tell us how big the block header is.
 393    */
 394   offs = iter.block.compressed_file_offset;
 395   if (next->pread (next, header, 1, offs, 0, err) == -1) {
 396     nbdkit_error ("xz: read: could not read block header byte: error %d", *err);
 397     return NULL;
 398   }
 399   offs++;
 400
 401   if (header[0] == '\0') {
 402     nbdkit_error ("xz: read: unexpected invalid block in file, header[0] = 0");
 403     return NULL;
 404   }
 405
 406   block.version = 0;
 407   block.check = iter.stream.flags->check;
 408   block.filters = filters;
 409   block.header_size = lzma_block_header_size_decode (header[0]);
 410
 411   /* Now read and decode the block header. */
 412   if (next->pread (next, &header[1], block.header_size-1, offs,
 413                    0, err) == -1) {
 414     nbdkit_error ("xz: read: could not read block of compressed data: "
 415                   "error %d", *err);
 416     return NULL;
 417   }
 418   offs += block.header_size - 1;
 419
 420   r = lzma_block_header_decode (&block, NULL, header);
 421   if (r != LZMA_OK) {
 422     nbdkit_error ("invalid block header (error %d)", r);
 423     return NULL;
 424   }
 425
 426   /* What this actually does is it checks that the block header
 427    * matches the index.
 428    */
 429   r = lzma_block_compressed_size (&block, iter.block.unpadded_size);
 430   if (r != LZMA_OK) {
 431     nbdkit_error ("cannot calculate compressed size (error %d)", r);
 432     goto err1;
 433   }
 434
 435   /* Read the block data. */
 436   r = lzma_block_decoder (&strm, &block);
 437   if (r != LZMA_OK) {
 438     nbdkit_error ("invalid block (error %d)", r);
 439     goto err1;
 440   }
 441
 442   data = malloc (*size_rtn);
 443   if (data == NULL) {
 444     nbdkit_error ("malloc (%" PRIu64 " bytes): %m\n"
 445                   "NOTE: If this error occurs, you need to recompress your "
 446                   "xz files with a smaller block size.  "
 447                   "Use: 'xz --block-size=16777216 ...'.",
 448                   *size_rtn);
 449     goto err2;
 450   }
 451
 452   buf = malloc (bufsize);
 453   if (buf == NULL) {
 454     nbdkit_error ("malloc: %m");
 455     goto err2;
 456   }
 457
 458   strm.next_in = NULL;
 459   strm.avail_in = 0;
 460   strm.next_out = (uint8_t *) data;
 461   strm.avail_out = block.uncompressed_size;
 462   do {
 463     if (strm.avail_in == 0) {
 464       strm.avail_in = bufsize;
 465       if (offs + strm.avail_in > size)
 466         strm.avail_in = size - offs;
 467       if (strm.avail_in > 0) {
 468         strm.next_in = buf;
 469         if (next->pread (next, buf, strm.avail_in, offs, 0, err) == -1) {
 470           nbdkit_error ("xz: read: error %d", *err);
 471           goto err2;
 472         }
 473         offs += strm.avail_in;
 474       }
 475     }
 476
 477     r = lzma_code (&strm, LZMA_RUN);
 478   } while (r == LZMA_OK);
 479
 480   if (r != LZMA_OK && r != LZMA_STREAM_END) {
 481     nbdkit_error ("could not parse block data (error %d)", r);
 482     goto err2;
 483   }
 484
 485   lzma_end (&strm);
 486
 487   for (i = 0; filters[i].id != LZMA_VLI_UNKNOWN; ++i)
 488     free (filters[i].options);
 489
 490   return data;
 491
 492  err2:
 493   lzma_end (&strm);
 494  err1:
 495   for (i = 0; filters[i].id != LZMA_VLI_UNKNOWN; ++i)
 496     free (filters[i].options);
 497
 498   free (data);
 499
 500   return NULL;
 501 }