block/qcow2-cluster.c

   1 /*
   2  * Block driver for the QCOW version 2 format
   3  *
   4  * Copyright (c) 2004-2006 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include <zlib.h>
  26
  27 #include "qemu-common.h"
  28 #include "block/block_int.h"
  29 #include "block/qcow2.h"
  30 #include "trace.h"
  31
  32 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
  33                         bool exact_size)
  34 {
  35     BDRVQcowState *s = bs->opaque;
  36     int new_l1_size2, ret, i;
  37     uint64_t *new_l1_table;
  38     int64_t new_l1_table_offset, new_l1_size;
  39     uint8_t data[12];
  40
  41     if (min_size <= s->l1_size)
  42         return 0;
  43
  44     if (exact_size) {
  45         new_l1_size = min_size;
  46     } else {
  47         /* Bump size up to reduce the number of times we have to grow */
  48         new_l1_size = s->l1_size;
  49         if (new_l1_size == 0) {
  50             new_l1_size = 1;
  51         }
  52         while (min_size > new_l1_size) {
  53             new_l1_size = (new_l1_size * 3 + 1) / 2;
  54         }
  55     }
  56
  57     if (new_l1_size > INT_MAX) {
  58         return -EFBIG;
  59     }
  60
  61 #ifdef DEBUG_ALLOC2
  62     fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
  63             s->l1_size, new_l1_size);
  64 #endif
  65
  66     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
  67     new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
  68     memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
  69
  70     /* write new table (align to cluster) */
  71     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
  72     new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
  73     if (new_l1_table_offset < 0) {
  74         g_free(new_l1_table);
  75         return new_l1_table_offset;
  76     }
  77
  78     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
  79     if (ret < 0) {
  80         goto fail;
  81     }
  82
  83     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
  84     for(i = 0; i < s->l1_size; i++)
  85         new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
  86     ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2);
  87     if (ret < 0)
  88         goto fail;
  89     for(i = 0; i < s->l1_size; i++)
  90         new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
  91
  92     /* set new table */
  93     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
  94     cpu_to_be32w((uint32_t*)data, new_l1_size);
  95     cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset);
  96     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data));
  97     if (ret < 0) {
  98         goto fail;
  99     }
 100     g_free(s->l1_table);
 101     qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t),
 102                         QCOW2_DISCARD_OTHER);
 103     s->l1_table_offset = new_l1_table_offset;
 104     s->l1_table = new_l1_table;
 105     s->l1_size = new_l1_size;
 106     return 0;
 107  fail:
 108     g_free(new_l1_table);
 109     qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
 110                         QCOW2_DISCARD_OTHER);
 111     return ret;
 112 }
 113
 114 /*
 115  * l2_load
 116  *
 117  * Loads a L2 table into memory. If the table is in the cache, the cache
 118  * is used; otherwise the L2 table is loaded from the image file.
 119  *
 120  * Returns a pointer to the L2 table on success, or NULL if the read from
 121  * the image file failed.
 122  */
 123
 124 static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
 125     uint64_t **l2_table)
 126 {
 127     BDRVQcowState *s = bs->opaque;
 128     int ret;
 129
 130     ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
 131
 132     return ret;
 133 }
 134
 135 /*
 136  * Writes one sector of the L1 table to the disk (can't update single entries
 137  * and we really don't want bdrv_pread to perform a read-modify-write)
 138  */
 139 #define L1_ENTRIES_PER_SECTOR (512 / 8)
 140 static int write_l1_entry(BlockDriverState *bs, int l1_index)
 141 {
 142     BDRVQcowState *s = bs->opaque;
 143     uint64_t buf[L1_ENTRIES_PER_SECTOR];
 144     int l1_start_index;
 145     int i, ret;
 146
 147     l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
 148     for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
 149         buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
 150     }
 151
 152     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
 153     ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
 154         buf, sizeof(buf));
 155     if (ret < 0) {
 156         return ret;
 157     }
 158
 159     return 0;
 160 }
 161
 162 /*
 163  * l2_allocate
 164  *
 165  * Allocate a new l2 entry in the file. If l1_index points to an already
 166  * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 167  * table) copy the contents of the old L2 table into the newly allocated one.
 168  * Otherwise the new table is initialized with zeros.
 169  *
 170  */
 171
 172 static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
 173 {
 174     BDRVQcowState *s = bs->opaque;
 175     uint64_t old_l2_offset;
 176     uint64_t *l2_table;
 177     int64_t l2_offset;
 178     int ret;
 179
 180     old_l2_offset = s->l1_table[l1_index];
 181
 182     trace_qcow2_l2_allocate(bs, l1_index);
 183
 184     /* allocate a new l2 entry */
 185
 186     l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
 187     if (l2_offset < 0) {
 188         return l2_offset;
 189     }
 190
 191     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 192     if (ret < 0) {
 193         goto fail;
 194     }
 195
 196     /* allocate a new entry in the l2 cache */
 197
 198     trace_qcow2_l2_allocate_get_empty(bs, l1_index);
 199     ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
 200     if (ret < 0) {
 201         return ret;
 202     }
 203
 204     l2_table = *table;
 205
 206     if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
 207         /* if there was no old l2 table, clear the new table */
 208         memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
 209     } else {
 210         uint64_t* old_table;
 211
 212         /* if there was an old l2 table, read it from the disk */
 213         BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
 214         ret = qcow2_cache_get(bs, s->l2_table_cache,
 215             old_l2_offset & L1E_OFFSET_MASK,
 216             (void**) &old_table);
 217         if (ret < 0) {
 218             goto fail;
 219         }
 220
 221         memcpy(l2_table, old_table, s->cluster_size);
 222
 223         ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
 224         if (ret < 0) {
 225             goto fail;
 226         }
 227     }
 228
 229     /* write the l2 table to the file */
 230     BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
 231
 232     trace_qcow2_l2_allocate_write_l2(bs, l1_index);
 233     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 234     ret = qcow2_cache_flush(bs, s->l2_table_cache);
 235     if (ret < 0) {
 236         goto fail;
 237     }
 238
 239     /* update the L1 entry */
 240     trace_qcow2_l2_allocate_write_l1(bs, l1_index);
 241     s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
 242     ret = write_l1_entry(bs, l1_index);
 243     if (ret < 0) {
 244         goto fail;
 245     }
 246
 247     *table = l2_table;
 248     trace_qcow2_l2_allocate_done(bs, l1_index, 0);
 249     return 0;
 250
 251 fail:
 252     trace_qcow2_l2_allocate_done(bs, l1_index, ret);
 253     qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
 254     s->l1_table[l1_index] = old_l2_offset;
 255     return ret;
 256 }
 257
 258 /*
 259  * Checks how many clusters in a given L2 table are contiguous in the image
 260  * file. As soon as one of the flags in the bitmask stop_flags changes compared
 261  * to the first cluster, the search is stopped and the cluster is not counted
 262  * as contiguous. (This allows it, for example, to stop at the first compressed
 263  * cluster which may require a different handling)
 264  */
 265 static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
 266         uint64_t *l2_table, uint64_t start, uint64_t stop_flags)
 267 {
 268     int i;
 269     uint64_t mask = stop_flags | L2E_OFFSET_MASK;
 270     uint64_t offset = be64_to_cpu(l2_table[0]) & mask;
 271
 272     if (!offset)
 273         return 0;
 274
 275     for (i = start; i < start + nb_clusters; i++) {
 276         uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
 277         if (offset + (uint64_t) i * cluster_size != l2_entry) {
 278             break;
 279         }
 280     }
 281
 282         return (i - start);
 283 }
 284
 285 static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
 286 {
 287     int i;
 288
 289     for (i = 0; i < nb_clusters; i++) {
 290         int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
 291
 292         if (type != QCOW2_CLUSTER_UNALLOCATED) {
 293             break;
 294         }
 295     }
 296
 297     return i;
 298 }
 299
 300 /* The crypt function is compatible with the linux cryptoloop
 301    algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 302    supported */
 303 void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
 304                            uint8_t *out_buf, const uint8_t *in_buf,
 305                            int nb_sectors, int enc,
 306                            const AES_KEY *key)
 307 {
 308     union {
 309         uint64_t ll[2];
 310         uint8_t b[16];
 311     } ivec;
 312     int i;
 313
 314     for(i = 0; i < nb_sectors; i++) {
 315         ivec.ll[0] = cpu_to_le64(sector_num);
 316         ivec.ll[1] = 0;
 317         AES_cbc_encrypt(in_buf, out_buf, 512, key,
 318                         ivec.b, enc);
 319         sector_num++;
 320         in_buf += 512;
 321         out_buf += 512;
 322     }
 323 }
 324
 325 static int coroutine_fn copy_sectors(BlockDriverState *bs,
 326                                      uint64_t start_sect,
 327                                      uint64_t cluster_offset,
 328                                      int n_start, int n_end)
 329 {
 330     BDRVQcowState *s = bs->opaque;
 331     QEMUIOVector qiov;
 332     struct iovec iov;
 333     int n, ret;
 334
 335     /*
 336      * If this is the last cluster and it is only partially used, we must only
 337      * copy until the end of the image, or bdrv_check_request will fail for the
 338      * bdrv_read/write calls below.
 339      */
 340     if (start_sect + n_end > bs->total_sectors) {
 341         n_end = bs->total_sectors - start_sect;
 342     }
 343
 344     n = n_end - n_start;
 345     if (n <= 0) {
 346         return 0;
 347     }
 348
 349     iov.iov_len = n * BDRV_SECTOR_SIZE;
 350     iov.iov_base = qemu_blockalign(bs, iov.iov_len);
 351
 352     qemu_iovec_init_external(&qiov, &iov, 1);
 353
 354     BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
 355
 356     /* Call .bdrv_co_readv() directly instead of using the public block-layer
 357      * interface.  This avoids double I/O throttling and request tracking,
 358      * which can lead to deadlock when block layer copy-on-read is enabled.
 359      */
 360     ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
 361     if (ret < 0) {
 362         goto out;
 363     }
 364
 365     if (s->crypt_method) {
 366         qcow2_encrypt_sectors(s, start_sect + n_start,
 367                         iov.iov_base, iov.iov_base, n, 1,
 368                         &s->aes_encrypt_key);
 369     }
 370
 371     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
 372     ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
 373     if (ret < 0) {
 374         goto out;
 375     }
 376
 377     ret = 0;
 378 out:
 379     qemu_vfree(iov.iov_base);
 380     return ret;
 381 }
 382
 383
 384 /*
 385  * get_cluster_offset
 386  *
 387  * For a given offset of the disk image, find the cluster offset in
 388  * qcow2 file. The offset is stored in *cluster_offset.
 389  *
 390  * on entry, *num is the number of contiguous sectors we'd like to
 391  * access following offset.
 392  *
 393  * on exit, *num is the number of contiguous sectors we can read.
 394  *
 395  * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 396  * cases.
 397  */
 398 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 399     int *num, uint64_t *cluster_offset)
 400 {
 401     BDRVQcowState *s = bs->opaque;
 402     unsigned int l2_index;
 403     uint64_t l1_index, l2_offset, *l2_table;
 404     int l1_bits, c;
 405     unsigned int index_in_cluster, nb_clusters;
 406     uint64_t nb_available, nb_needed;
 407     int ret;
 408
 409     index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
 410     nb_needed = *num + index_in_cluster;
 411
 412     l1_bits = s->l2_bits + s->cluster_bits;
 413
 414     /* compute how many bytes there are between the offset and
 415      * the end of the l1 entry
 416      */
 417
 418     nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
 419
 420     /* compute the number of available sectors */
 421
 422     nb_available = (nb_available >> 9) + index_in_cluster;
 423
 424     if (nb_needed > nb_available) {
 425         nb_needed = nb_available;
 426     }
 427
 428     *cluster_offset = 0;
 429
 430     /* seek the the l2 offset in the l1 table */
 431
 432     l1_index = offset >> l1_bits;
 433     if (l1_index >= s->l1_size) {
 434         ret = QCOW2_CLUSTER_UNALLOCATED;
 435         goto out;
 436     }
 437
 438     l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
 439     if (!l2_offset) {
 440         ret = QCOW2_CLUSTER_UNALLOCATED;
 441         goto out;
 442     }
 443
 444     /* load the l2 table in memory */
 445
 446     ret = l2_load(bs, l2_offset, &l2_table);
 447     if (ret < 0) {
 448         return ret;
 449     }
 450
 451     /* find the cluster offset for the given disk offset */
 452
 453     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 454     *cluster_offset = be64_to_cpu(l2_table[l2_index]);
 455     nb_clusters = size_to_clusters(s, nb_needed << 9);
 456
 457     ret = qcow2_get_cluster_type(*cluster_offset);
 458     switch (ret) {
 459     case QCOW2_CLUSTER_COMPRESSED:
 460         /* Compressed clusters can only be processed one by one */
 461         c = 1;
 462         *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
 463         break;
 464     case QCOW2_CLUSTER_ZERO:
 465         if (s->qcow_version < 3) {
 466             return -EIO;
 467         }
 468         c = count_contiguous_clusters(nb_clusters, s->cluster_size,
 469                 &l2_table[l2_index], 0,
 470                 QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
 471         *cluster_offset = 0;
 472         break;
 473     case QCOW2_CLUSTER_UNALLOCATED:
 474         /* how many empty clusters ? */
 475         c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
 476         *cluster_offset = 0;
 477         break;
 478     case QCOW2_CLUSTER_NORMAL:
 479         /* how many allocated clusters ? */
 480         c = count_contiguous_clusters(nb_clusters, s->cluster_size,
 481                 &l2_table[l2_index], 0,
 482                 QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
 483         *cluster_offset &= L2E_OFFSET_MASK;
 484         break;
 485     default:
 486         abort();
 487     }
 488
 489     qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 490
 491     nb_available = (c * s->cluster_sectors);
 492
 493 out:
 494     if (nb_available > nb_needed)
 495         nb_available = nb_needed;
 496
 497     *num = nb_available - index_in_cluster;
 498
 499     return ret;
 500 }
 501
 502 /*
 503  * get_cluster_table
 504  *
 505  * for a given disk offset, load (and allocate if needed)
 506  * the l2 table.
 507  *
 508  * the l2 table offset in the qcow2 file and the cluster index
 509  * in the l2 table are given to the caller.
 510  *
 511  * Returns 0 on success, -errno in failure case
 512  */
 513 static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
 514                              uint64_t **new_l2_table,
 515                              int *new_l2_index)
 516 {
 517     BDRVQcowState *s = bs->opaque;
 518     unsigned int l2_index;
 519     uint64_t l1_index, l2_offset;
 520     uint64_t *l2_table = NULL;
 521     int ret;
 522
 523     /* seek the the l2 offset in the l1 table */
 524
 525     l1_index = offset >> (s->l2_bits + s->cluster_bits);
 526     if (l1_index >= s->l1_size) {
 527         ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
 528         if (ret < 0) {
 529             return ret;
 530         }
 531     }
 532
 533     assert(l1_index < s->l1_size);
 534     l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
 535
 536     /* seek the l2 table of the given l2 offset */
 537
 538     if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
 539         /* load the l2 table in memory */
 540         ret = l2_load(bs, l2_offset, &l2_table);
 541         if (ret < 0) {
 542             return ret;
 543         }
 544     } else {
 545         /* First allocate a new L2 table (and do COW if needed) */
 546         ret = l2_allocate(bs, l1_index, &l2_table);
 547         if (ret < 0) {
 548             return ret;
 549         }
 550
 551         /* Then decrease the refcount of the old table */
 552         if (l2_offset) {
 553             qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
 554                                 QCOW2_DISCARD_OTHER);
 555         }
 556     }
 557
 558     /* find the cluster offset for the given disk offset */
 559
 560     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 561
 562     *new_l2_table = l2_table;
 563     *new_l2_index = l2_index;
 564
 565     return 0;
 566 }
 567
 568 /*
 569  * alloc_compressed_cluster_offset
 570  *
 571  * For a given offset of the disk image, return cluster offset in
 572  * qcow2 file.
 573  *
 574  * If the offset is not found, allocate a new compressed cluster.
 575  *
 576  * Return the cluster offset if successful,
 577  * Return 0, otherwise.
 578  *
 579  */
 580
 581 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 582                                                uint64_t offset,
 583                                                int compressed_size)
 584 {
 585     BDRVQcowState *s = bs->opaque;
 586     int l2_index, ret;
 587     uint64_t *l2_table;
 588     int64_t cluster_offset;
 589     int nb_csectors;
 590
 591     ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
 592     if (ret < 0) {
 593         return 0;
 594     }
 595
 596     /* Compression can't overwrite anything. Fail if the cluster was already
 597      * allocated. */
 598     cluster_offset = be64_to_cpu(l2_table[l2_index]);
 599     if (cluster_offset & L2E_OFFSET_MASK) {
 600         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 601         return 0;
 602     }
 603
 604     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
 605     if (cluster_offset < 0) {
 606         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 607         return 0;
 608     }
 609
 610     nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
 611                   (cluster_offset >> 9);
 612
 613     cluster_offset |= QCOW_OFLAG_COMPRESSED |
 614                       ((uint64_t)nb_csectors << s->csize_shift);
 615
 616     /* update L2 table */
 617
 618     /* compressed clusters never have the copied flag */
 619
 620     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
 621     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 622     l2_table[l2_index] = cpu_to_be64(cluster_offset);
 623     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 624     if (ret < 0) {
 625         return 0;
 626     }
 627
 628     return cluster_offset;
 629 }
 630
 631 static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
 632 {
 633     BDRVQcowState *s = bs->opaque;
 634     int ret;
 635
 636     if (r->nb_sectors == 0) {
 637         return 0;
 638     }
 639
 640     qemu_co_mutex_unlock(&s->lock);
 641     ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
 642                        r->offset / BDRV_SECTOR_SIZE,
 643                        r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
 644     qemu_co_mutex_lock(&s->lock);
 645
 646     if (ret < 0) {
 647         return ret;
 648     }
 649
 650     /*
 651      * Before we update the L2 table to actually point to the new cluster, we
 652      * need to be sure that the refcounts have been increased and COW was
 653      * handled.
 654      */
 655     qcow2_cache_depends_on_flush(s->l2_table_cache);
 656
 657     return 0;
 658 }
 659
 660 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 661 {
 662     BDRVQcowState *s = bs->opaque;
 663     int i, j = 0, l2_index, ret;
 664     uint64_t *old_cluster, *l2_table;
 665     uint64_t cluster_offset = m->alloc_offset;
 666
 667     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
 668     assert(m->nb_clusters > 0);
 669
 670     old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
 671
 672     /* copy content of unmodified sectors */
 673     ret = perform_cow(bs, m, &m->cow_start);
 674     if (ret < 0) {
 675         goto err;
 676     }
 677
 678     ret = perform_cow(bs, m, &m->cow_end);
 679     if (ret < 0) {
 680         goto err;
 681     }
 682
 683     /* Update L2 table. */
 684     if (s->use_lazy_refcounts) {
 685         qcow2_mark_dirty(bs);
 686     }
 687     if (qcow2_need_accurate_refcounts(s)) {
 688         qcow2_cache_set_dependency(bs, s->l2_table_cache,
 689                                    s->refcount_block_cache);
 690     }
 691
 692     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
 693     if (ret < 0) {
 694         goto err;
 695     }
 696     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 697
 698     for (i = 0; i < m->nb_clusters; i++) {
 699         /* if two concurrent writes happen to the same unallocated cluster
 700          * each write allocates separate cluster and writes data concurrently.
 701          * The first one to complete updates l2 table with pointer to its
 702          * cluster the second one has to do RMW (which is done above by
 703          * copy_sectors()), update l2 table with its cluster pointer and free
 704          * old cluster. This is what this loop does */
 705         if(l2_table[l2_index + i] != 0)
 706             old_cluster[j++] = l2_table[l2_index + i];
 707
 708         l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
 709                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
 710      }
 711
 712
 713     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 714     if (ret < 0) {
 715         goto err;
 716     }
 717
 718     /*
 719      * If this was a COW, we need to decrease the refcount of the old cluster.
 720      * Also flush bs->file to get the right order for L2 and refcount update.
 721      *
 722      * Don't discard clusters that reach a refcount of 0 (e.g. compressed
 723      * clusters), the next write will reuse them anyway.
 724      */
 725     if (j != 0) {
 726         for (i = 0; i < j; i++) {
 727             qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
 728                                     QCOW2_DISCARD_NEVER);
 729         }
 730     }
 731
 732     ret = 0;
 733 err:
 734     g_free(old_cluster);
 735     return ret;
 736  }
 737
 738 /*
 739  * Returns the number of contiguous clusters that can be used for an allocating
 740  * write, but require COW to be performed (this includes yet unallocated space,
 741  * which must copy from the backing file)
 742  */
 743 static int count_cow_clusters(BDRVQcowState *s, int nb_clusters,
 744     uint64_t *l2_table, int l2_index)
 745 {
 746     int i;
 747
 748     for (i = 0; i < nb_clusters; i++) {
 749         uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
 750         int cluster_type = qcow2_get_cluster_type(l2_entry);
 751
 752         switch(cluster_type) {
 753         case QCOW2_CLUSTER_NORMAL:
 754             if (l2_entry & QCOW_OFLAG_COPIED) {
 755                 goto out;
 756             }
 757             break;
 758         case QCOW2_CLUSTER_UNALLOCATED:
 759         case QCOW2_CLUSTER_COMPRESSED:
 760         case QCOW2_CLUSTER_ZERO:
 761             break;
 762         default:
 763             abort();
 764         }
 765     }
 766
 767 out:
 768     assert(i <= nb_clusters);
 769     return i;
 770 }
 771
 772 /*
 773  * Check if there already is an AIO write request in flight which allocates
 774  * the same cluster. In this case we need to wait until the previous
 775  * request has completed and updated the L2 table accordingly.
 776  *
 777  * Returns:
 778  *   0       if there was no dependency. *cur_bytes indicates the number of
 779  *           bytes from guest_offset that can be read before the next
 780  *           dependency must be processed (or the request is complete)
 781  *
 782  *   -EAGAIN if we had to wait for another request, previously gathered
 783  *           information on cluster allocation may be invalid now. The caller
 784  *           must start over anyway, so consider *cur_bytes undefined.
 785  */
 786 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
 787     uint64_t *cur_bytes, QCowL2Meta **m)
 788 {
 789     BDRVQcowState *s = bs->opaque;
 790     QCowL2Meta *old_alloc;
 791     uint64_t bytes = *cur_bytes;
 792
 793     QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
 794
 795         uint64_t start = guest_offset;
 796         uint64_t end = start + bytes;
 797         uint64_t old_start = l2meta_cow_start(old_alloc);
 798         uint64_t old_end = l2meta_cow_end(old_alloc);
 799
 800         if (end <= old_start || start >= old_end) {
 801             /* No intersection */
 802         } else {
 803             if (start < old_start) {
 804                 /* Stop at the start of a running allocation */
 805                 bytes = old_start - start;
 806             } else {
 807                 bytes = 0;
 808             }
 809
 810             /* Stop if already an l2meta exists. After yielding, it wouldn't
 811              * be valid any more, so we'd have to clean up the old L2Metas
 812              * and deal with requests depending on them before starting to
 813              * gather new ones. Not worth the trouble. */
 814             if (bytes == 0 && *m) {
 815                 *cur_bytes = 0;
 816                 return 0;
 817             }
 818
 819             if (bytes == 0) {
 820                 /* Wait for the dependency to complete. We need to recheck
 821                  * the free/allocated clusters when we continue. */
 822                 qemu_co_mutex_unlock(&s->lock);
 823                 qemu_co_queue_wait(&old_alloc->dependent_requests);
 824                 qemu_co_mutex_lock(&s->lock);
 825                 return -EAGAIN;
 826             }
 827         }
 828     }
 829
 830     /* Make sure that existing clusters and new allocations are only used up to
 831      * the next dependency if we shortened the request above */
 832     *cur_bytes = bytes;
 833
 834     return 0;
 835 }
 836
 837 /*
 838  * Checks how many already allocated clusters that don't require a copy on
 839  * write there are at the given guest_offset (up to *bytes). If
 840  * *host_offset is not zero, only physically contiguous clusters beginning at
 841  * this host offset are counted.
 842  *
 843  * Note that guest_offset may not be cluster aligned. In this case, the
 844  * returned *host_offset points to exact byte referenced by guest_offset and
 845  * therefore isn't cluster aligned as well.
 846  *
 847  * Returns:
 848  *   0:     if no allocated clusters are available at the given offset.
 849  *          *bytes is normally unchanged. It is set to 0 if the cluster
 850  *          is allocated and doesn't need COW, but doesn't have the right
 851  *          physical offset.
 852  *
 853  *   1:     if allocated clusters that don't require a COW are available at
 854  *          the requested offset. *bytes may have decreased and describes
 855  *          the length of the area that can be written to.
 856  *
 857  *  -errno: in error cases
 858  */
 859 static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
 860     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
 861 {
 862     BDRVQcowState *s = bs->opaque;
 863     int l2_index;
 864     uint64_t cluster_offset;
 865     uint64_t *l2_table;
 866     unsigned int nb_clusters;
 867     unsigned int keep_clusters;
 868     int ret, pret;
 869
 870     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
 871                               *bytes);
 872
 873     assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset)
 874                                 == offset_into_cluster(s, *host_offset));
 875
 876     /*
 877      * Calculate the number of clusters to look for. We stop at L2 table
 878      * boundaries to keep things simple.
 879      */
 880     nb_clusters =
 881         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
 882
 883     l2_index = offset_to_l2_index(s, guest_offset);
 884     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
 885
 886     /* Find L2 entry for the first involved cluster */
 887     ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
 888     if (ret < 0) {
 889         return ret;
 890     }
 891
 892     cluster_offset = be64_to_cpu(l2_table[l2_index]);
 893
 894     /* Check how many clusters are already allocated and don't need COW */
 895     if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
 896         && (cluster_offset & QCOW_OFLAG_COPIED))
 897     {
 898         /* If a specific host_offset is required, check it */
 899         bool offset_matches =
 900             (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
 901
 902         if (*host_offset != 0 && !offset_matches) {
 903             *bytes = 0;
 904             ret = 0;
 905             goto out;
 906         }
 907
 908         /* We keep all QCOW_OFLAG_COPIED clusters */
 909         keep_clusters =
 910             count_contiguous_clusters(nb_clusters, s->cluster_size,
 911                                       &l2_table[l2_index], 0,
 912                                       QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
 913         assert(keep_clusters <= nb_clusters);
 914
 915         *bytes = MIN(*bytes,
 916                  keep_clusters * s->cluster_size
 917                  - offset_into_cluster(s, guest_offset));
 918
 919         ret = 1;
 920     } else {
 921         ret = 0;
 922     }
 923
 924     /* Cleanup */
 925 out:
 926     pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 927     if (pret < 0) {
 928         return pret;
 929     }
 930
 931     /* Only return a host offset if we actually made progress. Otherwise we
 932      * would make requirements for handle_alloc() that it can't fulfill */
 933     if (ret) {
 934         *host_offset = (cluster_offset & L2E_OFFSET_MASK)
 935                      + offset_into_cluster(s, guest_offset);
 936     }
 937
 938     return ret;
 939 }
 940
 941 /*
 942  * Allocates new clusters for the given guest_offset.
 943  *
 944  * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
 945  * contain the number of clusters that have been allocated and are contiguous
 946  * in the image file.
 947  *
 948  * If *host_offset is non-zero, it specifies the offset in the image file at
 949  * which the new clusters must start. *nb_clusters can be 0 on return in this
 950  * case if the cluster at host_offset is already in use. If *host_offset is
 951  * zero, the clusters can be allocated anywhere in the image file.
 952  *
 953  * *host_offset is updated to contain the offset into the image file at which
 954  * the first allocated cluster starts.
 955  *
 956  * Return 0 on success and -errno in error cases. -EAGAIN means that the
 957  * function has been waiting for another request and the allocation must be
 958  * restarted, but the whole request should not be failed.
 959  */
 960 static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
 961     uint64_t *host_offset, unsigned int *nb_clusters)
 962 {
 963     BDRVQcowState *s = bs->opaque;
 964
 965     trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
 966                                          *host_offset, *nb_clusters);
 967
 968     /* Allocate new clusters */
 969     trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
 970     if (*host_offset == 0) {
 971         int64_t cluster_offset =
 972             qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
 973         if (cluster_offset < 0) {
 974             return cluster_offset;
 975         }
 976         *host_offset = cluster_offset;
 977         return 0;
 978     } else {
 979         int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
 980         if (ret < 0) {
 981             return ret;
 982         }
 983         *nb_clusters = ret;
 984         return 0;
 985     }
 986 }
 987
 988 /*
 989  * Allocates new clusters for an area that either is yet unallocated or needs a
 990  * copy on write. If *host_offset is non-zero, clusters are only allocated if
 991  * the new allocation can match the specified host offset.
 992  *
 993  * Note that guest_offset may not be cluster aligned. In this case, the
 994  * returned *host_offset points to exact byte referenced by guest_offset and
 995  * therefore isn't cluster aligned as well.
 996  *
 997  * Returns:
 998  *   0:     if no clusters could be allocated. *bytes is set to 0,
 999  *          *host_offset is left unchanged.
1000  *
1001  *   1:     if new clusters were allocated. *bytes may be decreased if the
1002  *          new allocation doesn't cover all of the requested area.
1003  *          *host_offset is updated to contain the host offset of the first
1004  *          newly allocated cluster.
1005  *
1006  *  -errno: in error cases
1007  */
1008 static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
1009     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
1010 {
1011     BDRVQcowState *s = bs->opaque;
1012     int l2_index;
1013     uint64_t *l2_table;
1014     uint64_t entry;
1015     unsigned int nb_clusters;
1016     int ret;
1017
1018     uint64_t alloc_cluster_offset;
1019
1020     trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
1021                              *bytes);
1022     assert(*bytes > 0);
1023
1024     /*
1025      * Calculate the number of clusters to look for. We stop at L2 table
1026      * boundaries to keep things simple.
1027      */
1028     nb_clusters =
1029         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1030
1031     l2_index = offset_to_l2_index(s, guest_offset);
1032     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1033
1034     /* Find L2 entry for the first involved cluster */
1035     ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
1036     if (ret < 0) {
1037         return ret;
1038     }
1039
1040     entry = be64_to_cpu(l2_table[l2_index]);
1041
1042     /* For the moment, overwrite compressed clusters one by one */
1043     if (entry & QCOW_OFLAG_COMPRESSED) {
1044         nb_clusters = 1;
1045     } else {
1046         nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
1047     }
1048
1049     /* This function is only called when there were no non-COW clusters, so if
1050      * we can't find any unallocated or COW clusters either, something is
1051      * wrong with our code. */
1052     assert(nb_clusters > 0);
1053
1054     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1055     if (ret < 0) {
1056         return ret;
1057     }
1058
1059     /* Allocate, if necessary at a given offset in the image file */
1060     alloc_cluster_offset = start_of_cluster(s, *host_offset);
1061     ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
1062                                   &nb_clusters);
1063     if (ret < 0) {
1064         goto fail;
1065     }
1066
1067     /* Can't extend contiguous allocation */
1068     if (nb_clusters == 0) {
1069         *bytes = 0;
1070         return 0;
1071     }
1072
1073     /*
1074      * Save info needed for meta data update.
1075      *
1076      * requested_sectors: Number of sectors from the start of the first
1077      * newly allocated cluster to the end of the (possibly shortened
1078      * before) write request.
1079      *
1080      * avail_sectors: Number of sectors from the start of the first
1081      * newly allocated to the end of the last newly allocated cluster.
1082      *
1083      * nb_sectors: The number of sectors from the start of the first
1084      * newly allocated cluster to the end of the area that the write
1085      * request actually writes to (excluding COW at the end)
1086      */
1087     int requested_sectors =
1088         (*bytes + offset_into_cluster(s, guest_offset))
1089         >> BDRV_SECTOR_BITS;
1090     int avail_sectors = nb_clusters
1091                         << (s->cluster_bits - BDRV_SECTOR_BITS);
1092     int alloc_n_start = offset_into_cluster(s, guest_offset)
1093                         >> BDRV_SECTOR_BITS;
1094     int nb_sectors = MIN(requested_sectors, avail_sectors);
1095     QCowL2Meta *old_m = *m;
1096
1097     *m = g_malloc0(sizeof(**m));
1098
1099     **m = (QCowL2Meta) {
1100         .next           = old_m,
1101
1102         .alloc_offset   = alloc_cluster_offset,
1103         .offset         = start_of_cluster(s, guest_offset),
1104         .nb_clusters    = nb_clusters,
1105         .nb_available   = nb_sectors,
1106
1107         .cow_start = {
1108             .offset     = 0,
1109             .nb_sectors = alloc_n_start,
1110         },
1111         .cow_end = {
1112             .offset     = nb_sectors * BDRV_SECTOR_SIZE,
1113             .nb_sectors = avail_sectors - nb_sectors,
1114         },
1115     };
1116     qemu_co_queue_init(&(*m)->dependent_requests);
1117     QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1118
1119     *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
1120     *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
1121                          - offset_into_cluster(s, guest_offset));
1122     assert(*bytes != 0);
1123
1124     return 1;
1125
1126 fail:
1127     if (*m && (*m)->nb_clusters > 0) {
1128         QLIST_REMOVE(*m, next_in_flight);
1129     }
1130     return ret;
1131 }
1132
1133 /*
1134  * alloc_cluster_offset
1135  *
1136  * For a given offset on the virtual disk, find the cluster offset in qcow2
1137  * file. If the offset is not found, allocate a new cluster.
1138  *
1139  * If the cluster was already allocated, m->nb_clusters is set to 0 and
1140  * other fields in m are meaningless.
1141  *
1142  * If the cluster is newly allocated, m->nb_clusters is set to the number of
1143  * contiguous clusters that have been allocated. In this case, the other
1144  * fields of m are valid and contain information about the first allocated
1145  * cluster.
1146  *
1147  * If the request conflicts with another write request in flight, the coroutine
1148  * is queued and will be reentered when the dependency has completed.
1149  *
1150  * Return 0 on success and -errno in error cases
1151  */
1152 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
1153     int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
1154 {
1155     BDRVQcowState *s = bs->opaque;
1156     uint64_t start, remaining;
1157     uint64_t cluster_offset;
1158     uint64_t cur_bytes;
1159     int ret;
1160
1161     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
1162                                       n_start, n_end);
1163
1164     assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset));
1165     offset = start_of_cluster(s, offset);
1166
1167 again:
1168     start = offset + (n_start << BDRV_SECTOR_BITS);
1169     remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
1170     cluster_offset = 0;
1171     *host_offset = 0;
1172     cur_bytes = 0;
1173     *m = NULL;
1174
1175     while (true) {
1176
1177         if (!*host_offset) {
1178             *host_offset = start_of_cluster(s, cluster_offset);
1179         }
1180
1181         assert(remaining >= cur_bytes);
1182
1183         start           += cur_bytes;
1184         remaining       -= cur_bytes;
1185         cluster_offset  += cur_bytes;
1186
1187         if (remaining == 0) {
1188             break;
1189         }
1190
1191         cur_bytes = remaining;
1192
1193         /*
1194          * Now start gathering as many contiguous clusters as possible:
1195          *
1196          * 1. Check for overlaps with in-flight allocations
1197          *
1198          *      a) Overlap not in the first cluster -> shorten this request and
1199          *         let the caller handle the rest in its next loop iteration.
1200          *
1201          *      b) Real overlaps of two requests. Yield and restart the search
1202          *         for contiguous clusters (the situation could have changed
1203          *         while we were sleeping)
1204          *
1205          *      c) TODO: Request starts in the same cluster as the in-flight
1206          *         allocation ends. Shorten the COW of the in-fight allocation,
1207          *         set cluster_offset to write to the same cluster and set up
1208          *         the right synchronisation between the in-flight request and
1209          *         the new one.
1210          */
1211         ret = handle_dependencies(bs, start, &cur_bytes, m);
1212         if (ret == -EAGAIN) {
1213             /* Currently handle_dependencies() doesn't yield if we already had
1214              * an allocation. If it did, we would have to clean up the L2Meta
1215              * structs before starting over. */
1216             assert(*m == NULL);
1217             goto again;
1218         } else if (ret < 0) {
1219             return ret;
1220         } else if (cur_bytes == 0) {
1221             break;
1222         } else {
1223             /* handle_dependencies() may have decreased cur_bytes (shortened
1224              * the allocations below) so that the next dependency is processed
1225              * correctly during the next loop iteration. */
1226         }
1227
1228         /*
1229          * 2. Count contiguous COPIED clusters.
1230          */
1231         ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
1232         if (ret < 0) {
1233             return ret;
1234         } else if (ret) {
1235             continue;
1236         } else if (cur_bytes == 0) {
1237             break;
1238         }
1239
1240         /*
1241          * 3. If the request still hasn't completed, allocate new clusters,
1242          *    considering any cluster_offset of steps 1c or 2.
1243          */
1244         ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
1245         if (ret < 0) {
1246             return ret;
1247         } else if (ret) {
1248             continue;
1249         } else {
1250             assert(cur_bytes == 0);
1251             break;
1252         }
1253     }
1254
1255     *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
1256     assert(*num > 0);
1257     assert(*host_offset != 0);
1258
1259     return 0;
1260 }
1261
1262 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1263                              const uint8_t *buf, int buf_size)
1264 {
1265     z_stream strm1, *strm = &strm1;
1266     int ret, out_len;
1267
1268     memset(strm, 0, sizeof(*strm));
1269
1270     strm->next_in = (uint8_t *)buf;
1271     strm->avail_in = buf_size;
1272     strm->next_out = out_buf;
1273     strm->avail_out = out_buf_size;
1274
1275     ret = inflateInit2(strm, -12);
1276     if (ret != Z_OK)
1277         return -1;
1278     ret = inflate(strm, Z_FINISH);
1279     out_len = strm->next_out - out_buf;
1280     if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1281         out_len != out_buf_size) {
1282         inflateEnd(strm);
1283         return -1;
1284     }
1285     inflateEnd(strm);
1286     return 0;
1287 }
1288
1289 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
1290 {
1291     BDRVQcowState *s = bs->opaque;
1292     int ret, csize, nb_csectors, sector_offset;
1293     uint64_t coffset;
1294
1295     coffset = cluster_offset & s->cluster_offset_mask;
1296     if (s->cluster_cache_offset != coffset) {
1297         nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
1298         sector_offset = coffset & 511;
1299         csize = nb_csectors * 512 - sector_offset;
1300         BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
1301         ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors);
1302         if (ret < 0) {
1303             return ret;
1304         }
1305         if (decompress_buffer(s->cluster_cache, s->cluster_size,
1306                               s->cluster_data + sector_offset, csize) < 0) {
1307             return -EIO;
1308         }
1309         s->cluster_cache_offset = coffset;
1310     }
1311     return 0;
1312 }
1313
1314 /*
1315  * This discards as many clusters of nb_clusters as possible at once (i.e.
1316  * all clusters in the same L2 table) and returns the number of discarded
1317  * clusters.
1318  */
1319 static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
1320     unsigned int nb_clusters)
1321 {
1322     BDRVQcowState *s = bs->opaque;
1323     uint64_t *l2_table;
1324     int l2_index;
1325     int ret;
1326     int i;
1327
1328     ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
1329     if (ret < 0) {
1330         return ret;
1331     }
1332
1333     /* Limit nb_clusters to one L2 table */
1334     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1335
1336     for (i = 0; i < nb_clusters; i++) {
1337         uint64_t old_offset;
1338
1339         old_offset = be64_to_cpu(l2_table[l2_index + i]);
1340         if ((old_offset & L2E_OFFSET_MASK) == 0) {
1341             continue;
1342         }
1343
1344         /* First remove L2 entries */
1345         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
1346         l2_table[l2_index + i] = cpu_to_be64(0);
1347
1348         /* Then decrease the refcount */
1349         qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
1350     }
1351
1352     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1353     if (ret < 0) {
1354         return ret;
1355     }
1356
1357     return nb_clusters;
1358 }
1359
1360 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
1361     int nb_sectors)
1362 {
1363     BDRVQcowState *s = bs->opaque;
1364     uint64_t end_offset;
1365     unsigned int nb_clusters;
1366     int ret;
1367
1368     end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
1369
1370     /* Round start up and end down */
1371     offset = align_offset(offset, s->cluster_size);
1372     end_offset &= ~(s->cluster_size - 1);
1373
1374     if (offset > end_offset) {
1375         return 0;
1376     }
1377
1378     nb_clusters = size_to_clusters(s, end_offset - offset);
1379
1380     s->cache_discards = true;
1381
1382     /* Each L2 table is handled by its own loop iteration */
1383     while (nb_clusters > 0) {
1384         ret = discard_single_l2(bs, offset, nb_clusters);
1385         if (ret < 0) {
1386             goto fail;
1387         }
1388
1389         nb_clusters -= ret;
1390         offset += (ret * s->cluster_size);
1391     }
1392
1393     ret = 0;
1394 fail:
1395     s->cache_discards = false;
1396     qcow2_process_discards(bs, ret);
1397
1398     return ret;
1399 }
1400
1401 /*
1402  * This zeroes as many clusters of nb_clusters as possible at once (i.e.
1403  * all clusters in the same L2 table) and returns the number of zeroed
1404  * clusters.
1405  */
1406 static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
1407     unsigned int nb_clusters)
1408 {
1409     BDRVQcowState *s = bs->opaque;
1410     uint64_t *l2_table;
1411     int l2_index;
1412     int ret;
1413     int i;
1414
1415     ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
1416     if (ret < 0) {
1417         return ret;
1418     }
1419
1420     /* Limit nb_clusters to one L2 table */
1421     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1422
1423     for (i = 0; i < nb_clusters; i++) {
1424         uint64_t old_offset;
1425
1426         old_offset = be64_to_cpu(l2_table[l2_index + i]);
1427
1428         /* Update L2 entries */
1429         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
1430         if (old_offset & QCOW_OFLAG_COMPRESSED) {
1431             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
1432             qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
1433         } else {
1434             l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
1435         }
1436     }
1437
1438     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1439     if (ret < 0) {
1440         return ret;
1441     }
1442
1443     return nb_clusters;
1444 }
1445
1446 int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
1447 {
1448     BDRVQcowState *s = bs->opaque;
1449     unsigned int nb_clusters;
1450     int ret;
1451
1452     /* The zero flag is only supported by version 3 and newer */
1453     if (s->qcow_version < 3) {
1454         return -ENOTSUP;
1455     }
1456
1457     /* Each L2 table is handled by its own loop iteration */
1458     nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
1459
1460     s->cache_discards = true;
1461
1462     while (nb_clusters > 0) {
1463         ret = zero_single_l2(bs, offset, nb_clusters);
1464         if (ret < 0) {
1465             goto fail;
1466         }
1467
1468         nb_clusters -= ret;
1469         offset += (ret * s->cluster_size);
1470     }
1471
1472     ret = 0;
1473 fail:
1474     s->cache_discards = false;
1475     qcow2_process_discards(bs, ret);
1476
1477     return ret;
1478 }