/*
 * L2/refcount table cache for the QCOW2 format
 *
 * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "block/block_int.h"
#include "qemu-common.h"
#include "qcow2.h"
#include "trace.h"
/* One cached on-disk table (one cluster of L2 or refcount-block data). */
typedef struct Qcow2CachedTable {
    int64_t  offset;        /* host offset of the table; 0 = entry unused */
    uint64_t lru_counter;   /* last-use stamp; 0 until first release */
    int      ref;           /* number of outstanding qcow2_cache_get() refs */
    bool     dirty;         /* table must be written back before eviction */
} Qcow2CachedTable;

/* A write-back cache of fixed size holding cluster-sized tables. */
struct Qcow2Cache {
    Qcow2CachedTable       *entries;     /* metadata, one per slot */
    struct Qcow2Cache      *depends;     /* cache that must be flushed first */
    int                     size;        /* number of table slots */
    bool                    depends_on_flush; /* need bdrv_flush() before write */
    void                   *table_array; /* contiguous buffer: size * cluster_size */
    uint64_t                lru_counter; /* monotonically increasing use stamp */
    uint64_t                cache_clean_lru_counter; /* high-water mark of last clean pass */
};
48 static inline void *qcow2_cache_get_table_addr(BlockDriverState
*bs
,
49 Qcow2Cache
*c
, int table
)
51 BDRVQcow2State
*s
= bs
->opaque
;
52 return (uint8_t *) c
->table_array
+ (size_t) table
* s
->cluster_size
;
55 static inline int qcow2_cache_get_table_idx(BlockDriverState
*bs
,
56 Qcow2Cache
*c
, void *table
)
58 BDRVQcow2State
*s
= bs
->opaque
;
59 ptrdiff_t table_offset
= (uint8_t *) table
- (uint8_t *) c
->table_array
;
60 int idx
= table_offset
/ s
->cluster_size
;
61 assert(idx
>= 0 && idx
< c
->size
&& table_offset
% s
->cluster_size
== 0);
65 static inline const char *qcow2_cache_get_name(BDRVQcow2State
*s
, Qcow2Cache
*c
)
67 if (c
== s
->refcount_block_cache
) {
68 return "refcount block";
69 } else if (c
== s
->l2_table_cache
) {
72 /* Do not abort, because this is not critical */
77 static void qcow2_cache_table_release(BlockDriverState
*bs
, Qcow2Cache
*c
,
78 int i
, int num_tables
)
80 /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
82 BDRVQcow2State
*s
= bs
->opaque
;
83 void *t
= qcow2_cache_get_table_addr(bs
, c
, i
);
84 int align
= getpagesize();
85 size_t mem_size
= (size_t) s
->cluster_size
* num_tables
;
86 size_t offset
= QEMU_ALIGN_UP((uintptr_t) t
, align
) - (uintptr_t) t
;
87 size_t length
= QEMU_ALIGN_DOWN(mem_size
- offset
, align
);
88 if (mem_size
> offset
&& length
> 0) {
89 madvise((uint8_t *) t
+ offset
, length
, MADV_DONTNEED
);
94 static inline bool can_clean_entry(Qcow2Cache
*c
, int i
)
96 Qcow2CachedTable
*t
= &c
->entries
[i
];
97 return t
->ref
== 0 && !t
->dirty
&& t
->offset
!= 0 &&
98 t
->lru_counter
<= c
->cache_clean_lru_counter
;
101 void qcow2_cache_clean_unused(BlockDriverState
*bs
, Qcow2Cache
*c
)
104 while (i
< c
->size
) {
107 /* Skip the entries that we don't need to clean */
108 while (i
< c
->size
&& !can_clean_entry(c
, i
)) {
112 /* And count how many we can clean in a row */
113 while (i
< c
->size
&& can_clean_entry(c
, i
)) {
114 c
->entries
[i
].offset
= 0;
115 c
->entries
[i
].lru_counter
= 0;
121 qcow2_cache_table_release(bs
, c
, i
- to_clean
, to_clean
);
125 c
->cache_clean_lru_counter
= c
->lru_counter
;
128 Qcow2Cache
*qcow2_cache_create(BlockDriverState
*bs
, int num_tables
)
130 BDRVQcow2State
*s
= bs
->opaque
;
133 c
= g_new0(Qcow2Cache
, 1);
134 c
->size
= num_tables
;
135 c
->entries
= g_try_new0(Qcow2CachedTable
, num_tables
);
136 c
->table_array
= qemu_try_blockalign(bs
->file
->bs
,
137 (size_t) num_tables
* s
->cluster_size
);
139 if (!c
->entries
|| !c
->table_array
) {
140 qemu_vfree(c
->table_array
);
149 int qcow2_cache_destroy(BlockDriverState
*bs
, Qcow2Cache
*c
)
153 for (i
= 0; i
< c
->size
; i
++) {
154 assert(c
->entries
[i
].ref
== 0);
157 qemu_vfree(c
->table_array
);
164 static int qcow2_cache_flush_dependency(BlockDriverState
*bs
, Qcow2Cache
*c
)
168 ret
= qcow2_cache_flush(bs
, c
->depends
);
174 c
->depends_on_flush
= false;
179 static int qcow2_cache_entry_flush(BlockDriverState
*bs
, Qcow2Cache
*c
, int i
)
181 BDRVQcow2State
*s
= bs
->opaque
;
184 if (!c
->entries
[i
].dirty
|| !c
->entries
[i
].offset
) {
188 trace_qcow2_cache_entry_flush(qemu_coroutine_self(),
189 c
== s
->l2_table_cache
, i
);
192 ret
= qcow2_cache_flush_dependency(bs
, c
);
193 } else if (c
->depends_on_flush
) {
194 ret
= bdrv_flush(bs
->file
->bs
);
196 c
->depends_on_flush
= false;
204 if (c
== s
->refcount_block_cache
) {
205 ret
= qcow2_pre_write_overlap_check(bs
, QCOW2_OL_REFCOUNT_BLOCK
,
206 c
->entries
[i
].offset
, s
->cluster_size
);
207 } else if (c
== s
->l2_table_cache
) {
208 ret
= qcow2_pre_write_overlap_check(bs
, QCOW2_OL_ACTIVE_L2
,
209 c
->entries
[i
].offset
, s
->cluster_size
);
211 ret
= qcow2_pre_write_overlap_check(bs
, 0,
212 c
->entries
[i
].offset
, s
->cluster_size
);
219 if (c
== s
->refcount_block_cache
) {
220 BLKDBG_EVENT(bs
->file
, BLKDBG_REFBLOCK_UPDATE_PART
);
221 } else if (c
== s
->l2_table_cache
) {
222 BLKDBG_EVENT(bs
->file
, BLKDBG_L2_UPDATE
);
225 ret
= bdrv_pwrite(bs
->file
, c
->entries
[i
].offset
,
226 qcow2_cache_get_table_addr(bs
, c
, i
), s
->cluster_size
);
231 c
->entries
[i
].dirty
= false;
236 int qcow2_cache_write(BlockDriverState
*bs
, Qcow2Cache
*c
)
238 BDRVQcow2State
*s
= bs
->opaque
;
243 trace_qcow2_cache_flush(qemu_coroutine_self(), c
== s
->l2_table_cache
);
245 for (i
= 0; i
< c
->size
; i
++) {
246 ret
= qcow2_cache_entry_flush(bs
, c
, i
);
247 if (ret
< 0 && result
!= -ENOSPC
) {
255 int qcow2_cache_flush(BlockDriverState
*bs
, Qcow2Cache
*c
)
257 int result
= qcow2_cache_write(bs
, c
);
260 int ret
= bdrv_flush(bs
->file
->bs
);
269 int qcow2_cache_set_dependency(BlockDriverState
*bs
, Qcow2Cache
*c
,
270 Qcow2Cache
*dependency
)
274 if (dependency
->depends
) {
275 ret
= qcow2_cache_flush_dependency(bs
, dependency
);
281 if (c
->depends
&& (c
->depends
!= dependency
)) {
282 ret
= qcow2_cache_flush_dependency(bs
, c
);
288 c
->depends
= dependency
;
292 void qcow2_cache_depends_on_flush(Qcow2Cache
*c
)
294 c
->depends_on_flush
= true;
297 int qcow2_cache_empty(BlockDriverState
*bs
, Qcow2Cache
*c
)
301 ret
= qcow2_cache_flush(bs
, c
);
306 for (i
= 0; i
< c
->size
; i
++) {
307 assert(c
->entries
[i
].ref
== 0);
308 c
->entries
[i
].offset
= 0;
309 c
->entries
[i
].lru_counter
= 0;
312 qcow2_cache_table_release(bs
, c
, 0, c
->size
);
319 static int qcow2_cache_do_get(BlockDriverState
*bs
, Qcow2Cache
*c
,
320 uint64_t offset
, void **table
, bool read_from_disk
)
322 BDRVQcow2State
*s
= bs
->opaque
;
326 uint64_t min_lru_counter
= UINT64_MAX
;
327 int min_lru_index
= -1;
331 trace_qcow2_cache_get(qemu_coroutine_self(), c
== s
->l2_table_cache
,
332 offset
, read_from_disk
);
334 if (offset_into_cluster(s
, offset
)) {
335 qcow2_signal_corruption(bs
, true, -1, -1, "Cannot get entry from %s "
336 "cache: Offset %#" PRIx64
" is unaligned",
337 qcow2_cache_get_name(s
, c
), offset
);
341 /* Check if the table is already cached */
342 i
= lookup_index
= (offset
/ s
->cluster_size
* 4) % c
->size
;
344 const Qcow2CachedTable
*t
= &c
->entries
[i
];
345 if (t
->offset
== offset
) {
348 if (t
->ref
== 0 && t
->lru_counter
< min_lru_counter
) {
349 min_lru_counter
= t
->lru_counter
;
352 if (++i
== c
->size
) {
355 } while (i
!= lookup_index
);
357 if (min_lru_index
== -1) {
358 /* This can't happen in current synchronous code, but leave the check
359 * here as a reminder for whoever starts using AIO with the cache */
363 /* Cache miss: write a table back and replace it */
365 trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(),
366 c
== s
->l2_table_cache
, i
);
368 ret
= qcow2_cache_entry_flush(bs
, c
, i
);
373 trace_qcow2_cache_get_read(qemu_coroutine_self(),
374 c
== s
->l2_table_cache
, i
);
375 c
->entries
[i
].offset
= 0;
376 if (read_from_disk
) {
377 if (c
== s
->l2_table_cache
) {
378 BLKDBG_EVENT(bs
->file
, BLKDBG_L2_LOAD
);
381 ret
= bdrv_pread(bs
->file
, offset
,
382 qcow2_cache_get_table_addr(bs
, c
, i
),
389 c
->entries
[i
].offset
= offset
;
391 /* And return the right table */
394 *table
= qcow2_cache_get_table_addr(bs
, c
, i
);
396 trace_qcow2_cache_get_done(qemu_coroutine_self(),
397 c
== s
->l2_table_cache
, i
);
402 int qcow2_cache_get(BlockDriverState
*bs
, Qcow2Cache
*c
, uint64_t offset
,
405 return qcow2_cache_do_get(bs
, c
, offset
, table
, true);
408 int qcow2_cache_get_empty(BlockDriverState
*bs
, Qcow2Cache
*c
, uint64_t offset
,
411 return qcow2_cache_do_get(bs
, c
, offset
, table
, false);
414 void qcow2_cache_put(BlockDriverState
*bs
, Qcow2Cache
*c
, void **table
)
416 int i
= qcow2_cache_get_table_idx(bs
, c
, *table
);
421 if (c
->entries
[i
].ref
== 0) {
422 c
->entries
[i
].lru_counter
= ++c
->lru_counter
;
425 assert(c
->entries
[i
].ref
>= 0);
428 void qcow2_cache_entry_mark_dirty(BlockDriverState
*bs
, Qcow2Cache
*c
,
431 int i
= qcow2_cache_get_table_idx(bs
, c
, table
);
432 assert(c
->entries
[i
].offset
!= 0);
433 c
->entries
[i
].dirty
= true;
436 void *qcow2_cache_is_table_offset(BlockDriverState
*bs
, Qcow2Cache
*c
,
441 for (i
= 0; i
< c
->size
; i
++) {
442 if (c
->entries
[i
].offset
== offset
) {
443 return qcow2_cache_get_table_addr(bs
, c
, i
);
449 void qcow2_cache_discard(BlockDriverState
*bs
, Qcow2Cache
*c
, void *table
)
451 int i
= qcow2_cache_get_table_idx(bs
, c
, table
);
453 assert(c
->entries
[i
].ref
== 0);
455 c
->entries
[i
].offset
= 0;
456 c
->entries
[i
].lru_counter
= 0;
457 c
->entries
[i
].dirty
= false;
459 qcow2_cache_table_release(bs
, c
, i
, 1);