plugins/xz/xzfile.c

   1 /* nbdkit
   2  * Copyright (C) 2013 Red Hat Inc.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions are
   7  * met:
   8  *
   9  * * Redistributions of source code must retain the above copyright
  10  * notice, this list of conditions and the following disclaimer.
  11  *
  12  * * Redistributions in binary form must reproduce the above copyright
  13  * notice, this list of conditions and the following disclaimer in the
  14  * documentation and/or other materials provided with the distribution.
  15  *
  16  * * Neither the name of Red Hat nor the names of its contributors may be
  17  * used to endorse or promote products derived from this software without
  18  * specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  22  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  23  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
  24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  27  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  28  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33
  34 /* liblzma is a complex interface, so abstract it here. */
  35
  36 #include <config.h>
  37
  38 #include <stdio.h>
  39 #include <stdlib.h>
  40 #include <string.h>
  41 #include <stdint.h>
  42 #include <inttypes.h>
  43 #include <unistd.h>
  44 #include <fcntl.h>
  45 #include <sys/types.h>
  46
  47 #include <nbdkit-plugin.h>
  48
  49 #include <lzma.h>
  50
  51 #include "xzfile.h"
  52
  53 #define XZ_HEADER_MAGIC     "\xfd" "7zXZ\0"
  54 #define XZ_HEADER_MAGIC_LEN 6
  55 #define XZ_FOOTER_MAGIC     "YZ"
  56 #define XZ_FOOTER_MAGIC_LEN 2
  57
  58 struct xzfile {
  59   int fd;
  60   lzma_index *idx;
  61   size_t nr_streams;
  62   size_t nr_blocks;
  63   uint64_t max_uncompressed_block_size;
  64 };
  65
  66 static int check_header_magic (int fd);
  67 static lzma_index *parse_indexes (const char *filename, int fd, size_t *);
  68 static int iter_indexes (lzma_index *idx, size_t *, uint64_t *);
  69
  70 xzfile *
  71 xzfile_open (const char *filename)
  72 {
  73   xzfile *xz;
  74   uint64_t size;
  75
  76   xz = malloc (sizeof *xz);
  77   if (xz == NULL) {
  78     nbdkit_error ("malloc: %m");
  79     return NULL;
  80   }
  81
  82   /* Open the file. */
  83   xz->fd = open (filename, O_RDONLY|O_CLOEXEC);
  84   if (xz->fd == -1) {
  85     nbdkit_error ("%s: %m", filename);
  86     goto err1;
  87   }
  88
  89   /* Check file magic. */
  90   if (!check_header_magic (xz->fd)) {
  91     nbdkit_error ("%s: not an xz file", filename);
  92     goto err2;
  93   }
  94
  95   /* Read and parse the indexes. */
  96   xz->idx = parse_indexes (filename, xz->fd, &xz->nr_streams);
  97   if (xz->idx == NULL)
  98     goto err2;
  99
 100   /* Iterate over indexes to find the number of and largest block. */
 101   if (iter_indexes (xz->idx,
 102                     &xz->nr_blocks, &xz->max_uncompressed_block_size) == -1)
 103     goto err2;
 104
 105   size = lzma_index_uncompressed_size (xz->idx);
 106   nbdkit_debug ("%s: size %" PRIu64 " bytes (%.1fM)",
 107                 filename, size, size / 1024.0 / 1024.0);
 108   nbdkit_debug ("%s: %zu streams, %zu blocks", filename,
 109                 xz->nr_streams, xz->nr_blocks);
 110   nbdkit_debug ("%s: maximum uncompressed block size %" PRIu64 " bytes (%.1fM)",
 111                 filename,
 112                 xz->max_uncompressed_block_size,
 113                 xz->max_uncompressed_block_size / 1024.0 / 1024.0);
 114
 115   return xz;
 116
 117  err2:
 118   close (xz->fd);
 119  err1:
 120   free (xz);
 121   return NULL;
 122 }
 123
 124 static int
 125 check_header_magic (int fd)
 126 {
 127   char buf[XZ_HEADER_MAGIC_LEN];
 128
 129   if (lseek (fd, 0, SEEK_SET) == -1)
 130     return 0;
 131   if (read (fd, buf, XZ_HEADER_MAGIC_LEN) != XZ_HEADER_MAGIC_LEN)
 132     return 0;
 133   if (memcmp (buf, XZ_HEADER_MAGIC, XZ_HEADER_MAGIC_LEN) != 0)
 134     return 0;
 135   return 1;
 136 }
 137
 138 /* For explanation of this function, see src/xz/list.c:parse_indexes
 139  * in the xz sources.
 140  */
 141 static lzma_index *
 142 parse_indexes (const char *filename, int fd, size_t *nr_streams)
 143 {
 144   lzma_ret r;
 145   off_t pos, index_size;
 146   uint8_t footer[LZMA_STREAM_HEADER_SIZE];
 147   uint8_t header[LZMA_STREAM_HEADER_SIZE];
 148   lzma_stream_flags footer_flags;
 149   lzma_stream_flags header_flags;
 150   lzma_stream strm = LZMA_STREAM_INIT;
 151   ssize_t n;
 152   lzma_index *combined_index = NULL;
 153   lzma_index *this_index = NULL;
 154   lzma_vli stream_padding = 0;
 155
 156   *nr_streams = 0;
 157
 158   /* Check file size is a multiple of 4 bytes. */
 159   pos = lseek (fd, 0, SEEK_END);
 160   if (pos == (off_t) -1) {
 161     nbdkit_error ("%s: lseek: %m", filename);
 162     goto err;
 163   }
 164   if ((pos & 3) != 0) {
 165     nbdkit_error ("%s: not an xz file: size is not a multiple of 4 bytes",
 166                   filename);
 167     goto err;
 168   }
 169
 170   /* Jump backwards through the file identifying each stream. */
 171   while (pos > 0) {
 172     nbdkit_debug ("looping through streams: pos = %" PRIu64, (uint64_t) pos);
 173
 174     if (pos < LZMA_STREAM_HEADER_SIZE) {
 175       nbdkit_error ("%s: corrupted file at %" PRIu64, filename, (uint64_t) pos);
 176       goto err;
 177     }
 178
 179     if (lseek (fd, -LZMA_STREAM_HEADER_SIZE, SEEK_CUR) == -1) {
 180       nbdkit_error ("%s: lseek: %m", filename);
 181       goto err;
 182     }
 183     if (read (fd, footer, LZMA_STREAM_HEADER_SIZE) != LZMA_STREAM_HEADER_SIZE) {
 184       nbdkit_error ("%s: read stream footer: %m", filename);
 185       goto err;
 186     }
 187     /* Skip stream padding. */
 188     if (footer[8] == 0 && footer[9] == 0 &&
 189         footer[10] == 0 && footer[11] == 0) {
 190       stream_padding += 4;
 191       pos -= 4;
 192       continue;
 193     }
 194
 195     pos -= LZMA_STREAM_HEADER_SIZE;
 196     (*nr_streams)++;
 197
 198     nbdkit_debug ("decode stream footer at pos = %" PRIu64, (uint64_t) pos);
 199
 200     /* Does the stream footer look reasonable? */
 201     r = lzma_stream_footer_decode (&footer_flags, footer);
 202     if (r != LZMA_OK) {
 203       nbdkit_error ("%s: invalid stream footer (error %d)", filename, r);
 204       goto err;
 205     }
 206     nbdkit_debug ("backward_size = %" PRIu64,
 207                   (uint64_t) footer_flags.backward_size);
 208     index_size = footer_flags.backward_size;
 209     if (pos < index_size + LZMA_STREAM_HEADER_SIZE) {
 210       nbdkit_error ("%s: invalid stream footer", filename);
 211       goto err;
 212     }
 213
 214     pos -= index_size;
 215     nbdkit_debug ("decode index at pos = %" PRIu64, (uint64_t) pos);
 216
 217     /* Seek backwards to the index of this stream. */
 218     if (lseek (fd, pos, SEEK_SET) == -1) {
 219       nbdkit_error ("%s: lseek: %m", filename);
 220       goto err;
 221     }
 222
 223     /* Decode the index. */
 224     r = lzma_index_decoder (&strm, &this_index, UINT64_MAX);
 225     if (r != LZMA_OK) {
 226       nbdkit_error ("%s: invalid stream index (error %d)", filename, r);
 227       goto err;
 228     }
 229
 230     do {
 231       uint8_t buf[BUFSIZ];
 232
 233       strm.avail_in = index_size;
 234       if (strm.avail_in > BUFSIZ)
 235         strm.avail_in = BUFSIZ;
 236
 237       n = read (fd, &buf, strm.avail_in);
 238       if (n == -1) {
 239         nbdkit_error ("read: %m");
 240         goto err;
 241       }
 242       index_size -= strm.avail_in;
 243
 244       strm.next_in = buf;
 245       r = lzma_code (&strm, LZMA_RUN);
 246     } while (r == LZMA_OK);
 247
 248     if (r != LZMA_STREAM_END) {
 249       nbdkit_error ("%s: could not parse index (error %d)",
 250                     filename, r);
 251       goto err;
 252     }
 253
 254     pos -= lzma_index_total_size (this_index) + LZMA_STREAM_HEADER_SIZE;
 255
 256     nbdkit_debug ("decode stream header at pos = %" PRIu64, (uint64_t) pos);
 257
 258     /* Read and decode the stream header. */
 259     if (lseek (fd, pos, SEEK_SET) == -1) {
 260       nbdkit_error ("%s: lseek: %m", filename);
 261       goto err;
 262     }
 263     if (read (fd, header, LZMA_STREAM_HEADER_SIZE) != LZMA_STREAM_HEADER_SIZE) {
 264       nbdkit_error ("%s: read stream header: %m", filename);
 265       goto err;
 266     }
 267
 268     r = lzma_stream_header_decode (&header_flags, header);
 269     if (r != LZMA_OK) {
 270       nbdkit_error ("%s: invalid stream header (error %d)", filename, r);
 271       goto err;
 272     }
 273
 274     /* Header and footer of the stream should be equal. */
 275     r = lzma_stream_flags_compare (&header_flags, &footer_flags);
 276     if (r != LZMA_OK) {
 277       nbdkit_error ("%s: header and footer of stream are not equal (error %d)",
 278                     filename, r);
 279       goto err;
 280     }
 281
 282     /* Store the decoded stream flags in this_index. */
 283     r = lzma_index_stream_flags (this_index, &footer_flags);
 284     if (r != LZMA_OK) {
 285       nbdkit_error ("%s: cannot read stream_flags from index (error %d)",
 286                     filename, r);
 287       goto err;
 288     }
 289
 290     /* Store the amount of stream padding so far.  Needed to calculate
 291      * compressed offsets correctly in multi-stream files.
 292      */
 293     r = lzma_index_stream_padding (this_index, stream_padding);
 294     if (r != LZMA_OK) {
 295       nbdkit_error ("%s: cannot set stream_padding in index (error %d)",
 296                     filename, r);
 297       goto err;
 298     }
 299
 300     if (combined_index != NULL) {
 301       r = lzma_index_cat (this_index, combined_index, NULL);
 302       if (r != LZMA_OK) {
 303         nbdkit_error ("%s: cannot combine indexes", filename);
 304         goto err;
 305       }
 306     }
 307
 308     combined_index = this_index;
 309     this_index = NULL;
 310   }
 311
 312   lzma_end (&strm);
 313
 314   return combined_index;
 315
 316  err:
 317   lzma_end (&strm);
 318   lzma_index_end (this_index, NULL);
 319   lzma_index_end (combined_index, NULL);
 320   return NULL;
 321 }
 322
 323 /* Iterate over the indexes to find the number of blocks and
 324  * the largest block.
 325  */
 326 static int
 327 iter_indexes (lzma_index *idx,
 328               size_t *nr_blocks, uint64_t *max_uncompressed_block_size)
 329 {
 330   lzma_index_iter iter;
 331
 332   *nr_blocks = 0;
 333   *max_uncompressed_block_size = 0;
 334
 335   lzma_index_iter_init (&iter, idx);
 336   while (!lzma_index_iter_next (&iter, LZMA_INDEX_ITER_NONEMPTY_BLOCK)) {
 337     if (iter.block.uncompressed_size > *max_uncompressed_block_size)
 338       *max_uncompressed_block_size = iter.block.uncompressed_size;
 339     (*nr_blocks)++;
 340   }
 341
 342   return 0;
 343 }
 344
 345 void
 346 xzfile_close (xzfile *xz)
 347 {
 348   lzma_index_end (xz->idx, NULL);
 349   close (xz->fd);
 350   free (xz);
 351 }
 352
 353 uint64_t
 354 xzfile_max_uncompressed_block_size (xzfile *xz)
 355 {
 356   return xz->max_uncompressed_block_size;
 357 }
 358
 359 uint64_t
 360 xzfile_get_size (xzfile *xz)
 361 {
 362   return lzma_index_uncompressed_size (xz->idx);
 363 }
 364
 365 char *
 366 xzfile_read_block (xzfile *xz, uint64_t offset,
 367                    uint64_t *start_rtn, uint64_t *size_rtn)
 368 {
 369   lzma_index_iter iter;
 370   uint8_t header[LZMA_BLOCK_HEADER_SIZE_MAX];
 371   lzma_block block;
 372   lzma_filter filters[LZMA_FILTERS_MAX + 1];
 373   lzma_ret r;
 374   lzma_stream strm = LZMA_STREAM_INIT;
 375   char *data;
 376   ssize_t n;
 377   size_t i;
 378
 379   /* Locate the block containing the uncompressed offset. */
 380   lzma_index_iter_init (&iter, xz->idx);
 381   if (lzma_index_iter_locate (&iter, offset)) {
 382     nbdkit_error ("cannot find offset %" PRIu64 " in the xz file", offset);
 383     return NULL;
 384   }
 385
 386   *start_rtn = iter.block.uncompressed_file_offset;
 387   *size_rtn = iter.block.uncompressed_size;
 388
 389   nbdkit_debug ("seek: block number %d at file offset %" PRIu64,
 390                 (int) iter.block.number_in_file,
 391                 (uint64_t) iter.block.compressed_file_offset);
 392
 393   if (lseek (xz->fd, iter.block.compressed_file_offset, SEEK_SET) == -1) {
 394     nbdkit_error ("lseek: %m");
 395     return NULL;
 396   }
 397
 398   /* Read the block header.  Start by reading a single byte which
 399    * tell us how big the block header is.
 400    */
 401   n = read (xz->fd, header, 1);
 402   if (n == 0) {
 403     nbdkit_error ("read: unexpected end of file reading block header byte");
 404     return NULL;
 405   }
 406   if (n == -1) {
 407     nbdkit_error ("read: %m");
 408     return NULL;
 409   }
 410
 411   if (header[0] == '\0') {
 412     nbdkit_error ("read: unexpected invalid block in file, header[0] = 0");
 413     return NULL;
 414   }
 415
 416   block.version = 0;
 417   block.check = iter.stream.flags->check;
 418   block.filters = filters;
 419   block.header_size = lzma_block_header_size_decode (header[0]);
 420
 421   /* Now read and decode the block header. */
 422   n = read (xz->fd, &header[1], block.header_size-1);
 423   if (n >= 0 && n != block.header_size-1) {
 424     nbdkit_error ("read: unexpected end of file reading block header");
 425     return NULL;
 426   }
 427   if (n == -1) {
 428     nbdkit_error ("read: %m");
 429     return NULL;
 430   }
 431
 432   r = lzma_block_header_decode (&block, NULL, header);
 433   if (r != LZMA_OK) {
 434     nbdkit_error ("invalid block header (error %d)", r);
 435     return NULL;
 436   }
 437
 438   /* What this actually does is it checks that the block header
 439    * matches the index.
 440    */
 441   r = lzma_block_compressed_size (&block, iter.block.unpadded_size);
 442   if (r != LZMA_OK) {
 443     nbdkit_error ("cannot calculate compressed size (error %d)", r);
 444     goto err1;
 445   }
 446
 447   /* Read the block data. */
 448   r = lzma_block_decoder (&strm, &block);
 449   if (r != LZMA_OK) {
 450     nbdkit_error ("invalid block (error %d)", r);
 451     goto err1;
 452   }
 453
 454   data = malloc (*size_rtn);
 455   if (data == NULL) {
 456     nbdkit_error ("malloc (%zu bytes): %m\n"
 457                   "NOTE: If this error occurs, you need to recompress your xz files with a smaller block size.  Use: 'xz --block-size=16777216 ...'.",
 458                   *size_rtn);
 459     goto err1;
 460   }
 461
 462   strm.next_in = NULL;
 463   strm.avail_in = 0;
 464   strm.next_out = (uint8_t *) data;
 465   strm.avail_out = block.uncompressed_size;
 466
 467   do {
 468     uint8_t buf[BUFSIZ];
 469     lzma_action action = LZMA_RUN;
 470
 471     if (strm.avail_in == 0) {
 472       strm.next_in = buf;
 473       n = read (xz->fd, buf, sizeof buf);
 474       if (n == -1) {
 475         nbdkit_error ("read: %m");
 476         goto err2;
 477       }
 478       strm.avail_in = n;
 479       if (n == 0)
 480         action = LZMA_FINISH;
 481     }
 482
 483     strm.avail_in = n;
 484     strm.next_in = buf;
 485     r = lzma_code (&strm, action);
 486   } while (r == LZMA_OK);
 487
 488   if (r != LZMA_OK && r != LZMA_STREAM_END) {
 489     nbdkit_error ("could not parse block data (error %d)", r);
 490     goto err2;
 491   }
 492
 493   lzma_end (&strm);
 494
 495   for (i = 0; filters[i].id != LZMA_VLI_UNKNOWN; ++i)
 496     free (filters[i].options);
 497
 498   return data;
 499
 500  err2:
 501   free (data);
 502   lzma_end (&strm);
 503  err1:
 504   for (i = 0; filters[i].id != LZMA_VLI_UNKNOWN; ++i)
 505     free (filters[i].options);
 506
 507   return NULL;
 508 }