/*
 * Copyright (c) 2008 Jakub Jermar
 * Copyright (c) 2008 Martin Decky
 * Copyright (c) 2011 Martin Sucha
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @addtogroup libblock
 * @{
 */

#include <ipc/services.h>
#include <errno.h>
#include <assert.h>
#include <async.h>
#include <fibril_synch.h>
#include <adt/list.h>
#include <adt/hash_table.h>
#include <loc.h>
#include <bd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/typefmt.h>
#include <stacktrace.h>
#include "libblock.h"

#define MAX_WRITE_RETRIES 10

/** Lock protecting the device connection list */
static FIBRIL_MUTEX_INITIALIZE(dcl_lock);

/** Device connection list head. */
static LIST_INITIALIZE(dcl);

typedef struct {
	fibril_mutex_t lock;
	size_t lblock_size;		/**< Logical block size. */
	unsigned blocks_cluster;	/**< Physical blocks per block_t */
	unsigned block_count;		/**< Total number of blocks. */
	unsigned blocks_cached;		/**< Number of cached blocks. */
	hash_table_t block_hash;
	list_t free_list;
	enum cache_mode mode;
} cache_t;

typedef struct {
	link_t link;

	service_id_t service_id;
	async_sess_t *sess;
	bd_t *bd;

	void *bb_buf;
	aoff64_t bb_addr;

	aoff64_t pblocks;	/**< Number of physical blocks */
	size_t pblock_size;	/**< Physical block size. */

	cache_t *cache;
} devcon_t;

static int read_blocks(devcon_t *, aoff64_t, size_t, void *, size_t);
static int write_blocks(devcon_t *, aoff64_t, size_t, void *, size_t);
static aoff64_t ba_ltop(devcon_t *, aoff64_t);

static devcon_t *devcon_search(service_id_t service_id)
{
	fibril_mutex_lock(&dcl_lock);

	list_foreach(dcl, link, devcon_t, devcon) {
		if (devcon->service_id == service_id) {
			fibril_mutex_unlock(&dcl_lock);
			return devcon;
		}
	}

	fibril_mutex_unlock(&dcl_lock);
	return NULL;
}

static int devcon_add(service_id_t service_id, async_sess_t *sess,
    size_t bsize, aoff64_t dev_size, bd_t *bd)
{
	devcon_t *devcon;

	devcon = malloc(sizeof(devcon_t));
	if (!devcon)
		return ENOMEM;

	link_initialize(&devcon->link);
	devcon->service_id = service_id;
	devcon->sess = sess;
	devcon->bd = bd;
	devcon->bb_buf = NULL;
	devcon->bb_addr = 0;
	devcon->pblock_size = bsize;
	devcon->pblocks = dev_size;
	devcon->cache = NULL;

	fibril_mutex_lock(&dcl_lock);
	list_foreach(dcl, link, devcon_t, d) {
		if (d->service_id == service_id) {
			fibril_mutex_unlock(&dcl_lock);
			free(devcon);
			return EEXIST;
		}
	}

	list_append(&devcon->link, &dcl);
	fibril_mutex_unlock(&dcl_lock);
	return EOK;
}

static void devcon_remove(devcon_t *devcon)
{
	fibril_mutex_lock(&dcl_lock);
	list_remove(&devcon->link);
	fibril_mutex_unlock(&dcl_lock);
}

int block_init(exch_mgmt_t mgmt, service_id_t service_id,
    size_t comm_size)
{
	bd_t *bd;

	async_sess_t *sess = loc_service_connect(mgmt, service_id,
	    IPC_FLAG_BLOCKING);
	if (!sess)
		return ENOENT;

	int rc = bd_open(sess, &bd);
	if (rc != EOK) {
		async_hangup(sess);
		return rc;
	}

	size_t bsize;
	rc = bd_get_block_size(bd, &bsize);
	if (rc != EOK) {
		bd_close(bd);
		async_hangup(sess);
		return rc;
	}

	aoff64_t dev_size;
	rc = bd_get_num_blocks(bd, &dev_size);
	if (rc != EOK) {
		bd_close(bd);
		async_hangup(sess);
		return rc;
	}

	rc = devcon_add(service_id, sess, bsize, dev_size, bd);
	if (rc != EOK) {
		bd_close(bd);
		async_hangup(sess);
		return rc;
	}

	return EOK;
}

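/*
 * Illustrative usage sketch (not part of the library): a client resolves a
 * block device service and brings the connection up before any other call.
 * The service name, the loc_service_get_id() lookup, the EXCHANGE_SERIALIZE
 * management style and the 2048-byte communication size are assumed example
 * values from the HelenOS loc/async APIs; error handling is abbreviated.
 *
 *	service_id_t sid;
 *	if (loc_service_get_id("bd/disk0", &sid, 0) != EOK)
 *		return;
 *	if (block_init(EXCHANGE_SERIALIZE, sid, 2048) != EOK)
 *		return;
 *	... block_read_direct(), block_get()/block_put() ...
 *	block_fini(sid);
 */
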
void block_fini(service_id_t service_id)
{
	devcon_t *devcon = devcon_search(service_id);
	assert(devcon);

	if (devcon->cache)
		(void) block_cache_fini(service_id);

	devcon_remove(devcon);

	if (devcon->bb_buf)
		free(devcon->bb_buf);

	bd_close(devcon->bd);
	async_hangup(devcon->sess);

	free(devcon);
}

int block_bb_read(service_id_t service_id, aoff64_t ba)
{
	void *bb_buf;
	int rc;

	devcon_t *devcon = devcon_search(service_id);
	if (!devcon)
		return ENOENT;

	bb_buf = malloc(devcon->pblock_size);
	if (!bb_buf)
		return ENOMEM;

	rc = read_blocks(devcon, 0, 1, bb_buf, devcon->pblock_size);
	if (rc != EOK) {
		free(bb_buf);
		return rc;
	}

	devcon->bb_buf = bb_buf;
	devcon->bb_addr = ba;

	return EOK;
}

void *block_bb_get(service_id_t service_id)
{
	devcon_t *devcon = devcon_search(service_id);
	assert(devcon);
	return devcon->bb_buf;
}

static size_t cache_key_hash(void *key)
{
	aoff64_t *lba = (aoff64_t *) key;
	return *lba;
}

static size_t cache_hash(const ht_link_t *item)
{
	block_t *b = hash_table_get_inst(item, block_t, hash_link);
	return b->lba;
}

static bool cache_key_equal(void *key, const ht_link_t *item)
{
	aoff64_t *lba = (aoff64_t *) key;
	block_t *b = hash_table_get_inst(item, block_t, hash_link);
	return b->lba == *lba;
}

static hash_table_ops_t cache_ops = {
	.hash = cache_hash,
	.key_hash = cache_key_hash,
	.key_equal = cache_key_equal,
	.equal = NULL,
	.remove_callback = NULL
};

int block_cache_init(service_id_t service_id, size_t size, unsigned blocks,
    enum cache_mode mode)
{
	cache_t *cache;

	devcon_t *devcon = devcon_search(service_id);
	if (!devcon)
		return ENOENT;
	if (devcon->cache)
		return EEXIST;

	cache = malloc(sizeof(cache_t));
	if (!cache)
		return ENOMEM;

	fibril_mutex_initialize(&cache->lock);
	list_initialize(&cache->free_list);
	cache->lblock_size = size;
	cache->block_count = blocks;
	cache->blocks_cached = 0;
	cache->mode = mode;

	/* Allow 1:1 or small-to-large block size translation */
	if (cache->lblock_size % devcon->pblock_size != 0) {
		free(cache);
		return ENOTSUP;
	}

	cache->blocks_cluster = cache->lblock_size / devcon->pblock_size;

	if (!hash_table_create(&cache->block_hash, 0, 0, &cache_ops)) {
		free(cache);
		return ENOMEM;
	}

	devcon->cache = cache;
	return EOK;
}

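/*
 * Example (sketch): enabling a write-back cache of 2048-byte logical blocks
 * on an already initialized device and tearing it down again. The sizes and
 * the block-count hint are arbitrary illustration values; the logical block
 * size must be a multiple of the device's physical block size, as checked
 * above.
 *
 *	rc = block_cache_init(service_id, 2048, 32, CACHE_MODE_WB);
 *	...
 *	rc = block_cache_fini(service_id);
 */
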
int block_cache_fini(service_id_t service_id)
{
	devcon_t *devcon = devcon_search(service_id);
	cache_t *cache;
	int rc;

	if (!devcon)
		return ENOENT;
	if (!devcon->cache)
		return EOK;

	cache = devcon->cache;

	/*
	 * We are expecting to find all blocks for this device handle on the
	 * free list, i.e. the block reference count should be zero. Do not
	 * bother with the cache and block locks because we are single-threaded.
	 */
	while (!list_empty(&cache->free_list)) {
		block_t *b = list_get_instance(list_first(&cache->free_list),
		    block_t, free_link);

		list_remove(&b->free_link);
		if (b->dirty) {
			rc = write_blocks(devcon, b->pba, cache->blocks_cluster,
			    b->data, b->size);
			if (rc != EOK)
				return rc;
		}

		hash_table_remove_item(&cache->block_hash, &b->hash_link);

		free(b->data);
		free(b);
	}

	hash_table_destroy(&cache->block_hash);
	devcon->cache = NULL;
	free(cache);

	return EOK;
}

#define CACHE_LO_WATERMARK	10
#define CACHE_HI_WATERMARK	20

static bool cache_can_grow(cache_t *cache)
{
	if (cache->blocks_cached < CACHE_LO_WATERMARK)
		return true;
	if (!list_empty(&cache->free_list))
		return false;
	return true;
}

static void block_initialize(block_t *b)
{
	fibril_mutex_initialize(&b->lock);
	b->refcnt = 1;
	b->write_failures = 0;
	b->dirty = false;
	b->toxic = false;
	fibril_rwlock_initialize(&b->contents_lock);
	link_initialize(&b->free_link);
}

/** Instantiate a block in memory and get a reference to it.
 *
 * @param block		Pointer to where the function will store the
 *			block pointer on success.
 * @param service_id	Service ID of the block device.
 * @param ba		Block address (logical).
 * @param flags		If BLOCK_FLAGS_NOREAD is specified, block_get()
 *			will not read the contents of the block from the
 *			device.
 *
 * @return		EOK on success or a negative error code.
 */
int block_get(block_t **block, service_id_t service_id, aoff64_t ba, int flags)
{
	devcon_t *devcon;
	cache_t *cache;
	block_t *b;
	link_t *link;
	aoff64_t p_ba;
	int rc;

	devcon = devcon_search(service_id);

	assert(devcon);
	assert(devcon->cache);

	cache = devcon->cache;

	/* Check whether the logical block (or part of it) is beyond
	 * the end of the device or not.
	 */
	p_ba = ba_ltop(devcon, ba);
	p_ba += cache->blocks_cluster;
	if (p_ba > devcon->pblocks) {
		/* This request cannot be satisfied */
		return EIO;
	}

retry:
	rc = EOK;
	b = NULL;

	fibril_mutex_lock(&cache->lock);
	ht_link_t *hlink = hash_table_find(&cache->block_hash, &ba);
	if (hlink) {
found:
		/*
		 * We found the block in the cache.
		 */
		b = hash_table_get_inst(hlink, block_t, hash_link);
		fibril_mutex_lock(&b->lock);
		if (b->refcnt++ == 0)
			list_remove(&b->free_link);
		if (b->toxic)
			rc = EIO;
		fibril_mutex_unlock(&b->lock);
		fibril_mutex_unlock(&cache->lock);
	} else {
		/*
		 * The block was not found in the cache.
		 */
		if (cache_can_grow(cache)) {
			/*
			 * We can grow the cache by allocating new blocks.
			 * Should the allocation fail, we fail over and try to
			 * recycle a block from the cache.
			 */
			b = malloc(sizeof(block_t));
			if (!b)
				goto recycle;

			b->data = malloc(cache->lblock_size);
			if (!b->data) {
				free(b);
				b = NULL;
				goto recycle;
			}
			cache->blocks_cached++;
		} else {
			/*
			 * Try to recycle a block from the free list.
			 */
recycle:
			if (list_empty(&cache->free_list)) {
				fibril_mutex_unlock(&cache->lock);
				rc = ENOMEM;
				goto out;
			}

			link = list_first(&cache->free_list);
			b = list_get_instance(link, block_t, free_link);

			fibril_mutex_lock(&b->lock);
			if (b->dirty) {
				/*
				 * The block needs to be written back to the
				 * device before it changes identity. Do this
				 * while not holding the cache lock so that
				 * concurrency is not impeded. Also move the
				 * block to the end of the free list so that we
				 * do not slow down other instances of
				 * block_get() draining the free list.
				 */
				list_remove(&b->free_link);
				list_append(&b->free_link, &cache->free_list);
				fibril_mutex_unlock(&cache->lock);
				rc = write_blocks(devcon, b->pba,
				    cache->blocks_cluster, b->data, b->size);
				if (rc != EOK) {
					/*
					 * We did not manage to write the block
					 * to the device. Keep it around for
					 * another try. Hopefully, we will grab
					 * another block next time.
					 */
					if (b->write_failures < MAX_WRITE_RETRIES) {
						b->write_failures++;
						fibril_mutex_unlock(&b->lock);
						goto retry;
					} else {
						printf("Too many errors writing block %"
						    PRIuOFF64 " from device handle %" PRIun "\n"
						    "SEVERE DATA LOSS POSSIBLE\n",
						    b->lba, devcon->service_id);
					}
				} else
					b->write_failures = 0;

				b->dirty = false;
				if (!fibril_mutex_trylock(&cache->lock)) {
					/*
					 * Somebody is probably racing with us.
					 * Unlock the block and retry.
					 */
					fibril_mutex_unlock(&b->lock);
					goto retry;
				}

				hlink = hash_table_find(&cache->block_hash, &ba);
				if (hlink) {
					/*
					 * Someone else must have already
					 * instantiated the block while we were
					 * not holding the cache lock.
					 * Leave the recycled block on the
					 * freelist and continue as if we
					 * found the block of interest during
					 * the first attempt.
					 */
					fibril_mutex_unlock(&b->lock);
					goto found;
				}
			}
			fibril_mutex_unlock(&b->lock);

			/*
			 * Unlink the block from the free list and the hash
			 * table.
			 */
			list_remove(&b->free_link);
			hash_table_remove_item(&cache->block_hash, &b->hash_link);
		}

		block_initialize(b);
		b->service_id = service_id;
		b->size = cache->lblock_size;
		b->lba = ba;
		b->pba = ba_ltop(devcon, b->lba);
		hash_table_insert(&cache->block_hash, &b->hash_link);

		/*
		 * Lock the block before releasing the cache lock. Thus we don't
		 * kill concurrent operations on the cache while doing I/O on
		 * the block.
		 */
		fibril_mutex_lock(&b->lock);
		fibril_mutex_unlock(&cache->lock);

		if (!(flags & BLOCK_FLAGS_NOREAD)) {
			/*
			 * The block contains old or no data. We need to read
			 * the new contents from the device.
			 */
			rc = read_blocks(devcon, b->pba, cache->blocks_cluster,
			    b->data, cache->lblock_size);
			if (rc != EOK)
				b->toxic = true;
		}

		fibril_mutex_unlock(&b->lock);
	}

out:
	if ((rc != EOK) && b) {
		/* Do not return a block whose contents could not be read. */
		(void) block_put(b);
		b = NULL;
	}

	*block = b;
	return rc;
}

/** Release a reference to a block.
 *
 * If the last reference is dropped, the block is put on the free list.
 *
 * @param block		Block of which a reference is to be released.
 *
 * @return		EOK on success or a negative error code.
 */
int block_put(block_t *block)
{
	devcon_t *devcon = devcon_search(block->service_id);
	cache_t *cache;
	unsigned blocks_cached;
	enum cache_mode mode;
	int rc = EOK;

	assert(devcon);
	assert(devcon->cache);
	assert(block->refcnt >= 1);

	cache = devcon->cache;

retry:
	fibril_mutex_lock(&cache->lock);
	blocks_cached = cache->blocks_cached;
	mode = cache->mode;
	fibril_mutex_unlock(&cache->lock);

	/*
	 * Determine whether to sync the block. Syncing the block is best done
	 * when not holding the cache lock as it does not impede concurrency.
	 * Since the situation may have changed when we unlocked the cache, the
	 * blocks_cached and mode variables are mere hints. We will recheck the
	 * conditions later when the cache lock is held again.
	 */
	fibril_mutex_lock(&block->lock);
	if (block->toxic)
		block->dirty = false;	/* will not write back toxic block */
	if (block->dirty && (block->refcnt == 1) &&
	    (blocks_cached > CACHE_HI_WATERMARK || mode != CACHE_MODE_WB)) {
		rc = write_blocks(devcon, block->pba, cache->blocks_cluster,
		    block->data, block->size);
		if (rc == EOK) {
			block->write_failures = 0;
			block->dirty = false;
		}
	}
	fibril_mutex_unlock(&block->lock);

	fibril_mutex_lock(&cache->lock);
	fibril_mutex_lock(&block->lock);
	if (!--block->refcnt) {
		/*
		 * Last reference to the block was dropped. Either free the
		 * block or put it on the free list. In case of an I/O error,
		 * free it.
		 */
		if ((cache->blocks_cached > CACHE_HI_WATERMARK) ||
		    (rc != EOK)) {
			/*
			 * Currently there are too many cached blocks or there
			 * was an I/O error when writing the block back to the
			 * device.
			 */
			if (block->dirty) {
				/*
				 * We cannot sync the block while holding the
				 * cache lock. Release everything and retry.
				 */
				if (block->write_failures < MAX_WRITE_RETRIES) {
					block->refcnt++;
					block->write_failures++;
					fibril_mutex_unlock(&block->lock);
					fibril_mutex_unlock(&cache->lock);
					goto retry;
				}

				printf("Too many errors writing block %"
				    PRIuOFF64 " from device handle %" PRIun "\n"
				    "SEVERE DATA LOSS POSSIBLE\n",
				    block->lba, devcon->service_id);
			}

			/*
			 * Take the block out of the cache and free it.
			 */
			hash_table_remove_item(&cache->block_hash, &block->hash_link);
			fibril_mutex_unlock(&block->lock);
			free(block->data);
			free(block);
			cache->blocks_cached--;
			fibril_mutex_unlock(&cache->lock);
			return rc;
		}

		/*
		 * Put the block on the free list.
		 */
		if (cache->mode != CACHE_MODE_WB && block->dirty) {
			/*
			 * We cannot sync the block while holding the cache
			 * lock. Release everything and retry.
			 */
			block->refcnt++;
			fibril_mutex_unlock(&block->lock);
			fibril_mutex_unlock(&cache->lock);
			goto retry;
		}
		list_append(&block->free_link, &cache->free_list);
	}
	fibril_mutex_unlock(&block->lock);
	fibril_mutex_unlock(&cache->lock);

	return rc;
}

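/*
 * Example of the get/modify/put cycle (sketch; service_id and ba are
 * placeholders). A caller marks the block dirty before dropping its
 * reference so that the write-back logic above can flush it;
 * BLOCK_FLAGS_NONE is assumed to be the no-op companion of
 * BLOCK_FLAGS_NOREAD from libblock.h.
 *
 *	block_t *b;
 *	int rc = block_get(&b, service_id, ba, BLOCK_FLAGS_NONE);
 *	if (rc == EOK) {
 *		memset(b->data, 0, b->size);
 *		b->dirty = true;
 *		rc = block_put(b);
 *	}
 */
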
/** Read sequential data from a block device.
 *
 * @param service_id	Service ID of the block device.
 * @param buf		Buffer for holding one block.
 * @param bufpos	Pointer to the first unread valid offset within the
 *			communication buffer.
 * @param buflen	Pointer to the number of unread bytes that are ready in
 *			the communication buffer.
 * @param pos		Device position to be read.
 * @param dst		Destination buffer.
 * @param size		Size of the destination buffer.
 *
 * @return		EOK on success or a negative return code on failure.
 */
int block_seqread(service_id_t service_id, void *buf, size_t *bufpos,
    size_t *buflen, aoff64_t *pos, void *dst, size_t size)
{
	size_t offset = 0;
	size_t left = size;
	size_t block_size;
	devcon_t *devcon;

	devcon = devcon_search(service_id);
	assert(devcon);
	block_size = devcon->pblock_size;

	while (left > 0) {
		size_t rd;

		if (*bufpos + left < *buflen)
			rd = left;
		else
			rd = *buflen - *bufpos;

		if (rd > 0) {
			/*
			 * Copy the contents of the communication buffer to the
			 * destination buffer.
			 */
			memcpy(dst + offset, buf + *bufpos, rd);
			offset += rd;
			*bufpos += rd;
			*pos += rd;
			left -= rd;
		}

		if (*bufpos == *buflen) {
			/* Refill the communication buffer with a new block. */
			int rc = read_blocks(devcon, *pos / block_size, 1, buf,
			    devcon->pblock_size);
			if (rc != EOK)
				return rc;

			*bufpos = 0;
			*buflen = block_size;
		}
	}

	return EOK;
}

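/*
 * Example of sequential reading (sketch): the caller owns the one-block
 * communication buffer and the bufpos/buflen/pos bookkeeping. Starting with
 * an empty buffer (*bufpos == *buflen == 0) forces the first refill; the
 * starting position is assumed to be block-aligned here.
 *
 *	char buf[bsize];
 *	size_t bufpos = 0, buflen = 0;
 *	aoff64_t pos = 0;
 *	rc = block_seqread(service_id, buf, &bufpos, &buflen, &pos, dst, len);
 */
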
/** Read blocks directly from device (bypass cache).
 *
 * @param service_id	Service ID of the block device.
 * @param ba		Address of first block (physical).
 * @param cnt		Number of blocks.
 * @param buf		Buffer for storing the data.
 *
 * @return		EOK on success or negative error code on failure.
 */
int block_read_direct(service_id_t service_id, aoff64_t ba, size_t cnt, void *buf)
{
	devcon_t *devcon;

	devcon = devcon_search(service_id);
	assert(devcon);

	return read_blocks(devcon, ba, cnt, buf, devcon->pblock_size * cnt);
}

/** Write blocks directly to device (bypass cache).
 *
 * @param service_id	Service ID of the block device.
 * @param ba		Address of first block (physical).
 * @param cnt		Number of blocks.
 * @param data		The data to be written.
 *
 * @return		EOK on success or negative error code on failure.
 */
int block_write_direct(service_id_t service_id, aoff64_t ba, size_t cnt,
    const void *data)
{
	devcon_t *devcon;

	devcon = devcon_search(service_id);
	assert(devcon);

	return write_blocks(devcon, ba, cnt, (void *) data,
	    devcon->pblock_size * cnt);
}

/** Get device block size.
 *
 * @param service_id	Service ID of the block device.
 * @param bsize		Output block size.
 *
 * @return		EOK on success or negative error code on failure.
 */
int block_get_bsize(service_id_t service_id, size_t *bsize)
{
	devcon_t *devcon;

	devcon = devcon_search(service_id);
	assert(devcon);

	return bd_get_block_size(devcon->bd, bsize);
}

/** Get number of blocks on device.
 *
 * @param service_id	Service ID of the block device.
 * @param nblocks	Output number of blocks.
 *
 * @return		EOK on success or negative error code on failure.
 */
int block_get_nblocks(service_id_t service_id, aoff64_t *nblocks)
{
	devcon_t *devcon = devcon_search(service_id);
	assert(devcon);

	return bd_get_num_blocks(devcon->bd, nblocks);
}

/** Read bytes directly from the device (bypass cache).
 *
 * @param service_id	Service ID of the block device.
 * @param abs_offset	Absolute offset in bytes where to start reading.
 * @param bytes		Number of bytes to read.
 * @param data		Buffer that receives the data.
 *
 * @return		EOK on success or negative error code on failure.
 */
int block_read_bytes_direct(service_id_t service_id, aoff64_t abs_offset,
    size_t bytes, void *data)
{
	int rc;
	size_t phys_block_size;
	size_t buf_size;
	void *buffer;
	aoff64_t first_block;
	aoff64_t last_block;
	size_t blocks;
	size_t offset;

	rc = block_get_bsize(service_id, &phys_block_size);
	if (rc != EOK)
		return rc;

	/* calculate data position and required space */
	first_block = abs_offset / phys_block_size;
	offset = abs_offset % phys_block_size;
	last_block = (abs_offset + bytes - 1) / phys_block_size;
	blocks = last_block - first_block + 1;
	buf_size = blocks * phys_block_size;

	/* read the data into memory */
	buffer = malloc(buf_size);
	if (buffer == NULL)
		return ENOMEM;

	rc = block_read_direct(service_id, first_block, blocks, buffer);
	if (rc != EOK) {
		free(buffer);
		return rc;
	}

	/* copy the data from the buffer */
	memcpy(data, buffer + offset, bytes);
	free(buffer);

	return EOK;
}

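/*
 * Worked example of the position arithmetic above (illustrative values):
 * with phys_block_size = 512, abs_offset = 1000 and bytes = 100,
 * first_block = 1000 / 512 = 1, offset = 1000 % 512 = 488,
 * last_block = (1000 + 100 - 1) / 512 = 2, blocks = 2 and buf_size = 1024,
 * so two physical blocks are read and the 100 requested bytes are copied
 * starting at offset 488 within the temporary buffer.
 */
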
/** Get TOC from device.
 *
 * @param service_id	Service ID of the block device.
 * @param session	Starting session.
 *
 * @return		Allocated TOC structure.
 * @return		NULL on failure.
 */
toc_block_t *block_get_toc(service_id_t service_id, uint8_t session)
{
	devcon_t *devcon = devcon_search(service_id);
	toc_block_t *toc = NULL;
	int rc;

	assert(devcon);

	toc = (toc_block_t *) malloc(sizeof(toc_block_t));
	if (toc == NULL)
		return NULL;

	rc = bd_read_toc(devcon->bd, session, toc, sizeof(toc_block_t));
	if (rc != EOK) {
		free(toc);
		return NULL;
	}

	return toc;
}

/** Read blocks from block device.
 *
 * @param devcon	Device connection.
 * @param ba		Address of first block.
 * @param cnt		Number of blocks.
 * @param buf		Buffer for storing the data.
 * @param size		Size of the buffer in bytes.
 *
 * @return		EOK on success or negative error code on failure.
 */
static int read_blocks(devcon_t *devcon, aoff64_t ba, size_t cnt, void *buf,
    size_t size)
{
	assert(devcon);

	int rc = bd_read_blocks(devcon->bd, ba, cnt, buf, size);
	if (rc != EOK) {
		printf("Error %d reading %zu blocks starting at block %" PRIuOFF64
		    " from device handle %" PRIun "\n", rc, cnt, ba,
		    devcon->service_id);
		stacktrace_print();
	}

	return rc;
}

/** Write blocks to block device.
 *
 * @param devcon	Device connection.
 * @param ba		Address of first block.
 * @param cnt		Number of blocks.
 * @param data		Buffer containing the data to write.
 * @param size		Size of the buffer in bytes.
 *
 * @return		EOK on success or negative error code on failure.
 */
static int write_blocks(devcon_t *devcon, aoff64_t ba, size_t cnt, void *data,
    size_t size)
{
	assert(devcon);

	int rc = bd_write_blocks(devcon->bd, ba, cnt, data, size);
	if (rc != EOK) {
		printf("Error %d writing %zu blocks starting at block %" PRIuOFF64
		    " to device handle %" PRIun "\n", rc, cnt, ba,
		    devcon->service_id);
		stacktrace_print();
	}

	return rc;
}

/** Convert logical block address to physical block address. */
static aoff64_t ba_ltop(devcon_t *devcon, aoff64_t lba)
{
	assert(devcon->cache != NULL);
	return lba * devcon->cache->blocks_cluster;
}

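/*
 * Example (illustrative): with 4096-byte logical blocks on a device with
 * 512-byte physical blocks, blocks_cluster = 4096 / 512 = 8, so logical
 * block address 5 maps to physical block address 5 * 8 = 40.
 */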