From bb29b5d82d5af72570d0ccbd03b8b8d3f21b4bd5 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 7 Nov 2010 09:29:39 -0800 Subject: [PATCH] HAMMER - Add hammer dedup directive and support * Implements all the logic required to dedup a HAMMER filesystem. * There is one remaining issue and that is the reblocker's propensity to undo de-dup's hard work in certain cases. * Code bounty for hammer_dedup Submitted-by: Ilya Dryomov --- sbin/hammer/Makefile | 4 +- sbin/hammer/cmd_cleanup.c | 34 ++ sbin/hammer/cmd_config.c | 1 + sbin/hammer/cmd_dedup.c | 755 +++++++++++++++++++++++++++++++++++++++ sbin/hammer/hammer.8 | 53 ++- sbin/hammer/hammer.c | 15 + sbin/hammer/hammer.h | 3 +- sys/conf/files | 1 + sys/vfs/hammer/Makefile | 3 +- sys/vfs/hammer/hammer.h | 10 +- sys/vfs/hammer/hammer_blockmap.c | 107 ++++++ sys/vfs/hammer/hammer_cursor.c | 85 ++++- sys/vfs/hammer/hammer_dedup.c | 182 ++++++++++ sys/vfs/hammer/hammer_ioctl.c | 49 +++ sys/vfs/hammer/hammer_ioctl.h | 26 ++ sys/vfs/hammer/hammer_subs.c | 8 +- 16 files changed, 1313 insertions(+), 23 deletions(-) create mode 100644 sbin/hammer/cmd_dedup.c create mode 100644 sys/vfs/hammer/hammer_dedup.c diff --git a/sbin/hammer/Makefile b/sbin/hammer/Makefile index 7ba6a4d4c3..9c4df8ee05 100644 --- a/sbin/hammer/Makefile +++ b/sbin/hammer/Makefile @@ -8,11 +8,11 @@ SRCS= hammer.c ondisk.c blockmap.c cache.c misc.c cycle.c \ cmd_synctid.c cmd_stats.c \ cmd_pseudofs.c cmd_snapshot.c cmd_mirror.c cmd_status.c \ cmd_cleanup.c cmd_info.c cmd_version.c cmd_volume.c \ - cmd_config.c cmd_recover.c + cmd_config.c cmd_recover.c cmd_dedup.c MAN= hammer.8 CFLAGS+= -I${.CURDIR}/../../sys -DALIST_NO_DEBUG -LDADD= -lm -lutil +LDADD= -lm -lutil -lmd .PATH: ${.CURDIR}/../../sys/libkern SRCS+= crc32.c diff --git a/sbin/hammer/cmd_cleanup.c b/sbin/hammer/cmd_cleanup.c index 83426842ef..d0f3a0e38e 100644 --- a/sbin/hammer/cmd_cleanup.c +++ b/sbin/hammer/cmd_cleanup.c @@ -46,6 +46,7 @@ * snapshots 1d 60d (0d 0d for /tmp, /var/tmp, /usr/obj) * prune 1d 5m * rebalance 1d 5m + * #dedup 1d 5m (not enabled by default) * reblock 1d 5m * recopy 30d 10m * @@ -93,6 +94,8 @@ static int cleanup_reblock(const char *path, const char *snapshots_path, int arg1, int arg2); static int cleanup_recopy(const char *path, const char *snapshots_path, int arg1, int arg2); +static int cleanup_dedup(const char *path, const char *snapshots_path, + int arg1, int arg2); static void runcmd(int *resp, const char *ctl, ...); @@ -494,6 +497,17 @@ do_cleanup(const char *path) } else { printf("skip\n"); } + } else if (strcmp(cmd, "dedup") == 0) { + if (check_period(snapshots_path, cmd, arg1, &savet)) { + printf("run"); + fflush(stdout); + if (VerboseOpt) + printf("\n"); + r = cleanup_dedup(path, snapshots_path, + arg1, arg2); + } else { + printf("skip\n"); + } } else { printf("unknown directive\n"); r = 1; @@ -541,6 +555,7 @@ config_init(const char *path, struct hammer_ioc_config *config) snapshots, "prune 1d 5m\n" "rebalance 1d 5m\n" + "#dedup 1d 5m\n" "reblock 1d 5m\n" "recopy 30d 10m\n"); } @@ -1168,6 +1183,25 @@ cleanup_recopy(const char *path, const char *snapshots_path, return(0); } +static int +cleanup_dedup(const char *path, const char *snapshots_path __unused, + int arg1 __unused, int arg2) +{ + if (VerboseOpt == 0) { + printf("."); + fflush(stdout); + } + + runcmd(NULL, "hammer -t %d dedup %s", arg2, path); + if (VerboseOpt == 0) { + printf("."); + fflush(stdout); + } + if (VerboseOpt == 0) + printf("\n"); + return(0); +} + static void runcmd(int *resp, const char *ctl, ...) 
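A note on the cleanup integration above: dedup is driven from the per-PFS
configuration file and the new directive ships commented out. A minimal
sketch of an enabled configuration (edited via hammer viconfig); the 1d/5m
values are simply this patch's defaults, not a tuning recommendation:

    snapshots 1d 60d
    prune     1d 5m
    rebalance 1d 5m
    dedup     1d 5m
    reblock   1d 5m
    recopy    30d 10m

With the dedup line enabled, cleanup_dedup() above shells out to
"hammer -t <seconds> dedup <pfs>", the 5m runtime cap being passed through
-t the same way the other directives pass theirs.
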
diff --git a/sbin/hammer/cmd_config.c b/sbin/hammer/cmd_config.c index 9a60146616..5e68e6059b 100644 --- a/sbin/hammer/cmd_config.c +++ b/sbin/hammer/cmd_config.c @@ -131,6 +131,7 @@ hammer_cmd_viconfig(char **av, int ac) "#snapshots 1d 60d\n" "#prune 1d 5m\n" "#rebalance 1d 5m\n" + "#dedup 1d 5m\n" "#reblock 1d 5m\n" "#recopy 30d 10m\n"); config.head.error = 0; diff --git a/sbin/hammer/cmd_dedup.c b/sbin/hammer/cmd_dedup.c new file mode 100644 index 0000000000..8dbdf85adf --- /dev/null +++ b/sbin/hammer/cmd_dedup.c @@ -0,0 +1,755 @@ +/* + * Copyright (c) 2010 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Ilya Dryomov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include "hammer.h"
+#include <libutil.h>
+#include <sha256.h>
+
+#define DEDUP_BUF (64 * 1024)
+
+/* Sorted list of block CRCs - light version for dedup-simulate */
+struct sim_dedup_entry_rb_tree;
+RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
+	RB_INITIALIZER(&sim_dedup_tree);
+RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
+	rb_sim_dedup_entry_compare, hammer_crc_t);
+
+struct sim_dedup_entry {
+	hammer_crc_t	crc;
+	u_int64_t	ref_blks;	/* number of blocks referenced */
+	u_int64_t	ref_size;	/* size of data referenced */
+	RB_ENTRY(sim_dedup_entry) rb_entry;
+};
+
+/* Sorted list of HAMMER B-Tree keys */
+struct dedup_entry_rb_tree;
+struct sha_dedup_entry_rb_tree;
+
+RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
+	RB_INITIALIZER(&dedup_tree);
+RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
+	rb_dedup_entry_compare, hammer_crc_t);
+
+RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
+	rb_sha_dedup_entry_compare);
+
+struct dedup_entry {
+	struct hammer_btree_leaf_elm leaf;
+	union {
+		struct {
+			u_int64_t ref_blks;
+			u_int64_t ref_size;
+		} de;
+		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
+	} u;
+	u_int8_t flags;
+	RB_ENTRY(dedup_entry) rb_entry;
+};
+
+#define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001
+
+struct sha_dedup_entry {
+	struct hammer_btree_leaf_elm leaf;
+	u_int64_t ref_blks;
+	u_int64_t ref_size;
+	u_int8_t sha_hash[SHA256_DIGEST_LENGTH];
+	RB_ENTRY(sha_dedup_entry) fict_entry;
+};
+
+/*
+ * Pass2 list - contains entries that were not dedup'ed because the
+ * ioctl failed
+ */
+STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
+	STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
+
+struct pass2_dedup_entry {
+	struct hammer_btree_leaf_elm leaf;
+	STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
+};
+
+#define DEDUP_PASS2	0x0001	/* process_btree_elm() mode */
+
+/* PFS globals - we deal with just one PFS per run */
+int glob_fd;
+struct hammer_ioc_pseudofs_rw glob_pfs;
+
+/*
+ * Global accounting variables
+ *
+ * The last three don't strictly have to be 64-bit; they are 64-bit
+ * just to be safe.
+ */ +u_int64_t dedup_alloc_size = 0; +u_int64_t dedup_ref_size = 0; +u_int64_t dedup_skipped_size = 0; +u_int64_t dedup_crc_failures = 0; +u_int64_t dedup_sha_failures = 0; +u_int64_t dedup_underflows = 0; + +static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1, + struct sim_dedup_entry *sim_de2); +static int rb_dedup_entry_compare(struct dedup_entry *de1, + struct dedup_entry *de2); +static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1, + struct sha_dedup_entry *sha_de2); +typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags); +static void scan_pfs(char *filesystem, scan_pfs_cb_t func); +static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); +static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); +static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash); +static void dump_simulated_dedup(void); +static void dump_real_dedup(void); +static void dedup_usage(int code); + +RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry, + rb_sim_dedup_entry_compare, hammer_crc_t, crc); +RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry, + rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc); +RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry, + rb_sha_dedup_entry_compare); + +static int +rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1, + struct sim_dedup_entry *sim_de2) +{ + if (sim_de1->crc < sim_de2->crc) + return (-1); + if (sim_de1->crc > sim_de2->crc) + return (1); + + return (0); +} + +static int +rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2) +{ + if (de1->leaf.data_crc < de2->leaf.data_crc) + return (-1); + if (de1->leaf.data_crc > de2->leaf.data_crc) + return (1); + + return (0); +} + +static int +rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1, + struct sha_dedup_entry *sha_de2) +{ + unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash; + unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash; + int i; + + for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) { + if (h1[i] < h2[i]) + return (-1); + if (h1[i] > h2[i]) + return (1); + } + + return (0); +} + +/* + * dedup-simulate + */ +void +hammer_cmd_dedup_simulate(char **av, int ac) +{ + struct sim_dedup_entry *sim_de; + + if (ac != 1) + dedup_usage(1); + + glob_fd = getpfs(&glob_pfs, av[0]); + + printf("Dedup-simulate "); + scan_pfs(av[0], &collect_btree_elm); + printf("Dedup-simulate %s succeeded\n", av[0]); + + relpfs(glob_fd, &glob_pfs); + + if (VerboseOpt >= 2) + dump_simulated_dedup(); + + /* + * Calculate simulated dedup ratio and get rid of the tree + */ + while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) { + assert(sim_de->ref_blks != 0); + dedup_ref_size += sim_de->ref_size; + dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks; + + RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de); + free(sim_de); + } + + printf("Simulated dedup ratio = %.2f\n", + (double)dedup_ref_size / dedup_alloc_size); +} + +/* + * dedup + */ +void +hammer_cmd_dedup(char **av, int ac) +{ + struct dedup_entry *de; + struct sha_dedup_entry *sha_de; + struct pass2_dedup_entry *pass2_de; + char buf[8]; + + if (TimeoutOpt > 0) + alarm(TimeoutOpt); + + if (ac != 1) + dedup_usage(1); + + STAILQ_INIT(&pass2_dedup_queue); + + glob_fd = getpfs(&glob_pfs, av[0]); + + printf("Dedup "); + scan_pfs(av[0], &process_btree_elm); + + while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) { + if (process_btree_elm(&pass2_de->leaf, 
DEDUP_PASS2))
+			dedup_skipped_size -= pass2_de->leaf.data_len;
+
+		STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
+		free(pass2_de);
+	}
+	assert(STAILQ_EMPTY(&pass2_dedup_queue));
+
+	printf("Dedup %s succeeded\n", av[0]);
+
+	relpfs(glob_fd, &glob_pfs);
+
+	if (VerboseOpt >= 2)
+		dump_real_dedup();
+
+	/*
+	 * Calculate dedup ratio and get rid of the trees
+	 */
+	while ((de = RB_ROOT(&dedup_tree)) != NULL) {
+		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+			while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
+				assert(sha_de->ref_blks != 0);
+				dedup_ref_size += sha_de->ref_size;
+				dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
+
+				RB_REMOVE(sha_dedup_entry_rb_tree,
+					&de->u.fict_root, sha_de);
+				free(sha_de);
+			}
+			assert(RB_EMPTY(&de->u.fict_root));
+		} else {
+			assert(de->u.de.ref_blks != 0);
+			dedup_ref_size += de->u.de.ref_size;
+			dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
+		}
+
+		RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
+		free(de);
+	}
+	assert(RB_EMPTY(&dedup_tree));
+
+	assert(dedup_alloc_size != 0);
+	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
+	printf("Dedup ratio = %.2f\n"
+	       "    %8s referenced\n",
+	       (double)dedup_ref_size / dedup_alloc_size,
+	       buf
+	);
+	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
+	printf("    %8s allocated\n", buf);
+	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
+	printf("    %8s skipped\n", buf);
+	printf("    %8jd CRC collisions\n"
+	       "    %8jd SHA collisions\n"
+	       "    %8jd bigblock underflows\n",
+	       (intmax_t)dedup_crc_failures,
+	       (intmax_t)dedup_sha_failures,
+	       (intmax_t)dedup_underflows
+	);
+}
+
+static int
+collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
+{
+	struct sim_dedup_entry *sim_de;
+
+	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
+			scan_leaf->data_crc);
+
+	if (sim_de == NULL) {
+		sim_de = calloc(sizeof(*sim_de), 1);
+		sim_de->crc = scan_leaf->data_crc;
+		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
+	}
+
+	sim_de->ref_blks += 1;
+	sim_de->ref_size += scan_leaf->data_len;
+	return (1);
+}
+
+static __inline int
+validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
+{
+	if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
+	    (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
+		return (1);
+	}
+	if (p->data_len != q->data_len) {
+		return (1);
+	}
+
+	return (0);
+}
+
+#define DEDUP_TECH_FAILURE	1
+#define DEDUP_CMP_FAILURE	2
+#define DEDUP_INVALID_ZONE	3
+#define DEDUP_UNDERFLOW		4
+
+static __inline int
+deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
+{
+	struct hammer_ioc_dedup dedup;
+
+	bzero(&dedup, sizeof(dedup));
+
+	/*
+	 * If the data_offset fields are the same there is no need to run
+	 * the ioctl; the candidate is already dedup'ed.
+	 */
+	if (p->data_offset == q->data_offset) {
+		return (0);
+	}
+
+	dedup.elm1 = p->base;
+	dedup.elm2 = q->base;
+	RunningIoctl = 1;
+	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
+		RunningIoctl = 0;
+		/* Technical failure - locking or whatever */
+		return (DEDUP_TECH_FAILURE);
+	}
+	RunningIoctl = 0;
+	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
+		return (DEDUP_CMP_FAILURE);
+	}
+	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
+		return (DEDUP_INVALID_ZONE);
+	}
+	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
+		return (DEDUP_UNDERFLOW);
+	}
+
+	return (0);
+}
+
+static int
+process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
+{
+	struct dedup_entry *de;
+	struct sha_dedup_entry *sha_de, temp;
+	struct pass2_dedup_entry *pass2_de;
+	int error;
+
+	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
+	if (de == NULL) {
+		/*
+		 * Insert new entry into CRC tree
+		 */
+		de = calloc(sizeof(*de), 1);
+		de->leaf = *scan_leaf;
+		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
+		goto upgrade_stats;
+	}
+
+	/*
+	 * Found entry in CRC tree
+	 */
+	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+		/*
+		 * The entry in the CRC tree is fictitious, so we already had
+		 * problems with this CRC. Upgrade (compute SHA) the candidate
+		 * and dive into the SHA subtree. If the upgrade fails insert
+		 * the candidate into the Pass2 list (it will be processed
+		 * later).
+		 */
+		if (upgrade_chksum(scan_leaf, temp.sha_hash))
+			goto pass2_insert;
+
+		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
+				&temp);
+		if (sha_de == NULL) {
+			/*
+			 * Nothing in SHA subtree so far, so this is a new
+			 * 'dataset'. Insert new entry into SHA subtree.
+			 */
+			sha_de = calloc(sizeof(*sha_de), 1);
+			sha_de->leaf = *scan_leaf;
+			memcpy(sha_de->sha_hash, temp.sha_hash,
+				SHA256_DIGEST_LENGTH);
+			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
+				sha_de);
+			goto upgrade_stats_sha;
+		}
+
+		/*
+		 * Found an entry in the SHA subtree, which means we have a
+		 * potential dedup pair. Validate it (the zones have to match
+		 * and the data_len fields have to be the same too). If
+		 * validation fails, treat it as a SHA collision (jump to
+		 * sha256_failure).
+		 */
+		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
+			goto sha256_failure;
+
+		/*
+		 * We have a valid dedup pair (SHA match, validated).
+		 *
+		 * In case of a technical failure (the dedup pair was good,
+		 * but the ioctl failed anyway) insert the candidate into the
+		 * Pass2 list (we will try to dedup it after we are done with
+		 * the rest of the tree).
+		 *
+		 * If the ioctl fails because either of the blocks is in a
+		 * non-dedup zone (we can dedup only in LARGE_DATA and
+		 * SMALL_DATA) don't bother with the candidate and terminate
+		 * early.
+		 *
+		 * If the ioctl fails because of a bigblock underflow, replace
+		 * the leaf node that the found dedup entry represents with
+		 * scan_leaf.
+		 */
+		error = deduplicate(&sha_de->leaf, scan_leaf);
+		switch(error) {
+		case DEDUP_TECH_FAILURE:
+			goto pass2_insert;
+		case DEDUP_CMP_FAILURE:
+			goto sha256_failure;
+		case DEDUP_INVALID_ZONE:
+			goto terminate_early;
+		case DEDUP_UNDERFLOW:
+			++dedup_underflows;
+			sha_de->leaf = *scan_leaf;
+			memcpy(sha_de->sha_hash, temp.sha_hash,
+				SHA256_DIGEST_LENGTH);
+			goto upgrade_stats_sha;
+		default:
+			goto upgrade_stats_sha;
+		}
+
+		/*
+		 * Ooh la la.. SHA-256 collision. Terminate early, there's
+		 * nothing we can do here.
+		 */
+sha256_failure:
+		++dedup_sha_failures;
+		goto terminate_early;
+	} else {
+		/*
+		 * The candidate CRC is good for now (we found an entry in
+		 * the CRC tree and it's not fictitious). This means we have
+		 * a potential dedup pair.
+		 */
+		if (validate_dedup_pair(&de->leaf, scan_leaf))
+			goto crc_failure;
+
+		/*
+		 * We have a valid dedup pair (CRC match, validated)
+		 */
+		error = deduplicate(&de->leaf, scan_leaf);
+		switch(error) {
+		case DEDUP_TECH_FAILURE:
+			goto pass2_insert;
+		case DEDUP_CMP_FAILURE:
+			goto crc_failure;
+		case DEDUP_INVALID_ZONE:
+			goto terminate_early;
+		case DEDUP_UNDERFLOW:
+			++dedup_underflows;
+			de->leaf = *scan_leaf;
+			goto upgrade_stats;
+		default:
+			goto upgrade_stats;
+		}
+
+crc_failure:
+		/*
+		 * We got a CRC collision - either the ioctl failed because
+		 * of a comparison failure or validation of the potential
+		 * dedup pair went bad. In both cases insert both blocks
+		 * into the SHA subtree (this requires a checksum upgrade)
+		 * and mark the entry that corresponds to this CRC in the
+		 * CRC tree fictitious, so that all further operations with
+		 * this CRC go through the SHA subtree.
+		 */
+		++dedup_crc_failures;
+
+		/*
+		 * Insert the block that was represented by the now-fictitious
+		 * dedup entry (create a new SHA entry and preserve the stats
+		 * of the old CRC one). If the checksum upgrade fails insert
+		 * the candidate into the Pass2 list and return - keep both
+		 * trees unmodified.
+		 */
+		sha_de = calloc(sizeof(*sha_de), 1);
+		sha_de->leaf = de->leaf;
+		sha_de->ref_blks = de->u.de.ref_blks;
+		sha_de->ref_size = de->u.de.ref_size;
+		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
+			free(sha_de);
+			goto pass2_insert;
+		}
+
+		RB_INIT(&de->u.fict_root);
+		/*
+		 * Here we can insert without prior checking because the tree
+		 * is empty at this point
+		 */
+		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
+
+		/*
+		 * Mark the entry in the CRC tree fictitious
+		 */
+		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
+
+		/*
+		 * Upgrade the checksum of the candidate and insert it into
+		 * the SHA subtree. If the upgrade fails insert the candidate
+		 * into the Pass2 list.
+		 */
+		if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
+			goto pass2_insert;
+		}
+		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
+				&temp);
+		if (sha_de != NULL) {
+			/*
+			 * There is an entry with this SHA already, but the
+			 * only RB-tree element at this point is the entry we
+			 * just added. We know for sure these blocks are
+			 * different (this is the crc_failure branch), so
+			 * treat it as a SHA collision.
+			 */
+			goto sha256_failure;
+		}
+
+		sha_de = calloc(sizeof(*sha_de), 1);
+		sha_de->leaf = *scan_leaf;
+		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
+		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
+		goto upgrade_stats_sha;
+	}
+
+upgrade_stats:
+	de->u.de.ref_blks += 1;
+	de->u.de.ref_size += scan_leaf->data_len;
+	return (1);
+
+upgrade_stats_sha:
+	sha_de->ref_blks += 1;
+	sha_de->ref_size += scan_leaf->data_len;
+	return (1);
+
+pass2_insert:
+	/*
+	 * If in pass2 mode don't insert anything; fall through to
+	 * terminate_early.
+	 */
+	if ((flags & DEDUP_PASS2) == 0) {
+		pass2_de = calloc(sizeof(*pass2_de), 1);
+		pass2_de->leaf = *scan_leaf;
+		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
+		dedup_skipped_size += scan_leaf->data_len;
+		return (1);
+	}
+
+terminate_early:
+	/*
+	 * Early termination path. Fixup stats.
+ */
+	dedup_alloc_size += scan_leaf->data_len;
+	dedup_ref_size += scan_leaf->data_len;
+	return (0);
+}
+
+static int
+upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
+{
+	struct hammer_ioc_data data;
+	char *buf = malloc(DEDUP_BUF);
+	SHA256_CTX ctx;
+	int error;
+
+	bzero(&data, sizeof(data));
+	data.elm = leaf->base;
+	data.ubuf = buf;
+	data.size = DEDUP_BUF;
+
+	error = 0;
+	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
+		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
+		error = 1;
+		goto done;
+	}
+
+	if (data.leaf.data_len != leaf->data_len) {
+		error = 1;
+		goto done;
+	}
+
+	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
+	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
+		SHA256_Init(&ctx);
+		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
+		SHA256_Final(sha_hash, &ctx);
+	}
+
+done:
+	free(buf);
+	return (error);
+}
+
+static void
+scan_pfs(char *filesystem, scan_pfs_cb_t func)
+{
+	struct hammer_ioc_mirror_rw mirror;
+	hammer_ioc_mrecord_any_t mrec;
+	struct hammer_btree_leaf_elm elm;
+	char *buf = malloc(DEDUP_BUF);
+	int offset, bytes;
+
+	bzero(&mirror, sizeof(mirror));
+	hammer_key_beg_init(&mirror.key_beg);
+	hammer_key_end_init(&mirror.key_end);
+
+	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
+	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
+	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
+	mirror.ubuf = buf;
+	mirror.size = DEDUP_BUF;
+	mirror.pfs_id = glob_pfs.pfs_id;
+	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
+
+	printf("%s: objspace %016jx:%04x %016jx:%04x pfs_id %d\n",
+		filesystem,
+		(uintmax_t)mirror.key_beg.obj_id,
+		mirror.key_beg.localization,
+		(uintmax_t)mirror.key_end.obj_id,
+		mirror.key_end.localization,
+		glob_pfs.pfs_id);
+	fflush(stdout);
+
+	do {
+		mirror.count = 0;
+		mirror.pfs_id = glob_pfs.pfs_id;
+		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
+		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
+			fprintf(stderr, "Mirror-read %s failed: %s\n",
+				filesystem, strerror(errno));
+			exit(1);
+		}
+		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
+			fprintf(stderr, "Mirror-read %s fatal error %d\n",
+				filesystem, mirror.head.error);
+			exit(1);
+		}
+		if (mirror.count) {
+			offset = 0;
+			while (offset < mirror.count) {
+				mrec = (void *)((char *)buf + offset);
+				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
+				if (offset + bytes > mirror.count) {
+					fprintf(stderr, "Misaligned record\n");
+					exit(1);
+				}
+				assert((mrec->head.type &
+				    HAMMER_MRECF_TYPE_MASK) ==
+				    HAMMER_MREC_TYPE_REC);
+
+				elm = mrec->rec.leaf;
+				if (elm.base.btype == HAMMER_BTREE_TYPE_RECORD &&
+				    elm.base.rec_type == HAMMER_RECTYPE_DATA) {
+					func(&elm, 0);
+				}
+				offset += bytes;
+			}
+		}
+		mirror.key_beg = mirror.key_cur;
+	} while (mirror.count != 0);
+
+	free(buf);
+}
+
+static void
+dump_simulated_dedup(void)
+{
+	struct sim_dedup_entry *sim_de;
+
+	printf("=== Dumping simulated dedup entries:\n");
+	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
+		printf("\tcrc=%08x cnt=%ju size=%ju\n", sim_de->crc,
+			(uintmax_t)sim_de->ref_blks,
+			(uintmax_t)sim_de->ref_size);
+	}
+	printf("end of dump ===\n");
+}
+
+static void
+dump_real_dedup(void)
+{
+	struct dedup_entry *de;
+	struct sha_dedup_entry *sha_de;
+	int i;
+
+	printf("=== Dumping dedup entries:\n");
+	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
+		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
+
+			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
+				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t\t\tsha=",
+					sha_de->leaf.data_crc,
+					(uintmax_t)sha_de->ref_blks,
+					(uintmax_t)sha_de->ref_size);
+				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
+					printf("%02x", sha_de->sha_hash[i]);
+				printf("\n");
+			}
+		} else {
+			printf("\tcrc=%08x cnt=%ju size=%ju\n",
+				de->leaf.data_crc,
+				(uintmax_t)de->u.de.ref_blks,
+				(uintmax_t)de->u.de.ref_size);
+		}
+	}
+	printf("end of dump ===\n");
+}
+
+static void
+dedup_usage(int code)
+{
+	fprintf(stderr,
+		"hammer dedup-simulate <filesystem>\n"
+		"hammer dedup <filesystem>\n"
+	);
+	exit(code);
+}
diff --git a/sbin/hammer/hammer.8 b/sbin/hammer/hammer.8
index 96856678a6..ed007b2e56 100644
--- a/sbin/hammer/hammer.8
+++ b/sbin/hammer/hammer.8
@@ -32,7 +32,7 @@
 .\"
 .\" $DragonFly: src/sbin/hammer/hammer.8,v 1.58 2008/11/13 02:04:27 dillon Exp $
 .\"
-.Dd February 12, 2010
+.Dd November 3, 2010
 .Dt HAMMER 8
 .Os
 .Sh NAME
@@ -290,9 +290,7 @@ through normal file system operations or through reblocking, before it can
 be reused.
 .Pp
 Data blocks can be shared by deducting the space used from the free byte
-count for each shared references, though
-.Nm HAMMER
-does not yet make use of this feature.
+count for each shared reference.
 This means the free byte count can legally go negative.
 .Pp
 This command needs the
@@ -427,8 +425,8 @@ displays the mount point of the PFS is currently mounted on (if any).
 .El
 .\" ==== cleanup ====
 .It Cm cleanup Op Ar filesystem ...
-This is a meta-command which executes snapshot, prune, rebalance and reblock
-commands on the specified
+This is a meta-command which executes snapshot, prune, rebalance, dedup
+and reblock commands on the specified
 .Nm HAMMER
 file systems.
 If no
@@ -467,6 +465,7 @@ The format of the configuration file is:
 snapshots <period> <retention-time> [any]
 prune     <period> <max-runtime>
 rebalance <period> <max-runtime>
+dedup     <period> <max-runtime>
 reblock   <period> <max-runtime>
 recopy    <period> <max-runtime>
 .Ed
@@ -476,6 +475,7 @@ Defaults are:
 snapshots 1d 60d  # 0d 0d for PFS /tmp, /var/tmp, /usr/obj
 prune     1d 5m
 rebalance 1d 5m
+dedup     1d 5m
 reblock   1d 5m
 recopy    30d 10m
 .Ed
@@ -527,12 +527,12 @@ nightly via
 .Xr periodic 8 .
 .Pp
 The default configuration file will create a daily snapshot, do a daily
-pruning, rebalancing and reblocking run and a monthly recopy run.
+pruning, rebalancing, deduping and reblocking run and a monthly recopy run.
 Reblocking is defragmentation with a level of 95%,
 and recopy is full defragmentation.
 .Pp
 By default prune and rebalance operations are time limited to 5 minutes,
-reblock operations to a bit over 5 minutes,
+dedup and reblock operations to a bit over 5 minutes,
 and recopy operations to a bit over 10 minutes.
 Reblocking and recopy runs are each broken down into four separate functions:
 btree, inodes, dirs and data.
@@ -854,6 +854,43 @@ suffix is not needed).
 Rebalancing is a per PFS operation, so a
 .Nm HAMMER
 file system and each PFS in it have to be rebalanced separately.
+.\" ==== dedup ====
+.It Cm dedup Ar filesystem
+.Nm ( HAMMER
+VERSION 5+)
+Perform offline (post-process) deduplication.
+Deduplication occurs at the block level; currently only data blocks of the
+same size can be deduped, metadata blocks can not.
+The hash function used for comparing data blocks is CRC-32 (CRCs are
+computed anyway as part of
+.Nm HAMMER
+data integrity features, so there is no additional overhead).
+Since CRC is a weak hash function, a byte-by-byte comparison is done
+before actual deduping.
+In case of a CRC collision (two data blocks have the same CRC but
+different contents) the checksum is upgraded to SHA-256.
+.Pp
+Currently the
+.Nm HAMMER
+reblocker may partially undo (re-expand) dedup, because the reblocker's
+normal operation is to reallocate every record, which can re-expand
+deduped blocks.
+.Pp
+Deduplication is a per PFS operation, so a
+.Nm HAMMER
+file system and each PFS in it have to be deduped separately.
+This also means that if you have duplicated data in two different PFSs
+that data won't be deduped; however, the addition of such a feature is
+planned.
+.\" ==== dedup-simulate ====
+.It Cm dedup-simulate Ar filesystem
+.Nm ( HAMMER
+VERSION 5+)
+Show the potential space savings (simulated dedup ratio) one can get
+after running the
+.Cm dedup
+command.
+If the estimated dedup ratio is greater than 1.00 you will see dedup
+space savings.
+Remember that this is an estimate; in practice the real dedup ratio
+will be slightly smaller because of
+.Nm HAMMER
+bigblock underflows, B-Tree locking issues and other factors.
 .\" ==== reblock* ====
 .It Cm reblock Ar filesystem Op Ar fill_percentage
 .It Cm reblock-btree Ar filesystem Op Ar fill_percentage
diff --git a/sbin/hammer/hammer.c b/sbin/hammer/hammer.c
index 80eedb01c2..b5c7f37529 100644
--- a/sbin/hammer/hammer.c
+++ b/sbin/hammer/hammer.c
@@ -416,6 +416,14 @@ main(int ac, char **av)
 		usage(1);
 		exit(0);
 	}
+	if (strcmp(av[0], "dedup-simulate") == 0) {
+		hammer_cmd_dedup_simulate(av + 1, ac - 1);
+		exit(0);
+	}
+	if (strcmp(av[0], "dedup") == 0) {
+		hammer_cmd_dedup(av + 1, ac - 1);
+		exit(0);
+	}
 	if (strcmp(av[0], "version") == 0) {
 		hammer_cmd_get_version(av + 1, ac - 1);
 		exit(0);
@@ -595,6 +603,13 @@ usage(int exit_code)
 		"hammer -f blkdevs recover <target_dir>\n"
 	);
 
+	fprintf(stderr, "\nHAMMER utility version 5+ commands:\n");
+
+	fprintf(stderr,
+		"hammer dedup-simulate <filesystem>\n"
+		"hammer dedup <filesystem>\n"
+	);
+
 	exit(exit_code);
 }
 
diff --git a/sbin/hammer/hammer.h b/sbin/hammer/hammer.h
index 30edf2c574..91ca70e2c8 100644
--- a/sbin/hammer/hammer.h
+++ b/sbin/hammer/hammer.h
@@ -116,6 +116,8 @@ void hammer_cmd_get_version(char **av, int ac);
 void hammer_cmd_set_version(char **av, int ac);
 void hammer_cmd_volume_add(char **av, int ac);
 void hammer_cmd_volume_del(char **av, int ac);
+void hammer_cmd_dedup_simulate(char **av, int ac);
+void hammer_cmd_dedup(char **av, int ac);
 
 void hammer_get_cycle(hammer_base_elm_t base, hammer_tid_t *tidp);
 void hammer_set_cycle(hammer_base_elm_t base, hammer_tid_t tid);
@@ -123,4 +125,3 @@ void hammer_reset_cycle(void);
 int getpfs(struct hammer_ioc_pseudofs_rw *pfs, const char *path);
 void relpfs(int fd, struct hammer_ioc_pseudofs_rw *pfs);
 
-
diff --git a/sys/conf/files b/sys/conf/files
index 967c589c1d..9e9376820b 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1703,6 +1703,7 @@ vfs/hammer/hammer_undo.c	optional hammer
 vfs/hammer/hammer_redo.c	optional hammer
 vfs/hammer/hammer_vfsops.c	optional hammer
 vfs/hammer/hammer_vnops.c	optional hammer
+vfs/hammer/hammer_dedup.c	optional hammer
 #tmpfs static filesystem for debugging should be marked as optional tmpfs
 vfs/tmpfs/tmpfs_fifoops.c	standard
 vfs/tmpfs/tmpfs_subr.c	standard
diff --git a/sys/vfs/hammer/Makefile b/sys/vfs/hammer/Makefile
index d548fef107..4d33e405f9 100644
--- a/sys/vfs/hammer/Makefile
+++ b/sys/vfs/hammer/Makefile
@@ -10,7 +10,8 @@ SRCS= hammer_vfsops.c hammer_vnops.c hammer_inode.c \
 	hammer_redo.c \
 	hammer_reblock.c hammer_rebalance.c \
 	hammer_flusher.c hammer_mirror.c \
-	hammer_pfs.c hammer_prune.c hammer_volume.c
+	hammer_pfs.c hammer_prune.c hammer_volume.c \
+	hammer_dedup.c
 
 SRCS+= opt_ktr.h
 
 .include <bsd.kmod.mk>
diff --git a/sys/vfs/hammer/hammer.h
b/sys/vfs/hammer/hammer.h index 6fab020d41..3f8c5a68da 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -1081,14 +1081,16 @@ int hammer_cursor_down(hammer_cursor_t cursor); int hammer_cursor_upgrade(hammer_cursor_t cursor); int hammer_cursor_upgrade_node(hammer_cursor_t cursor); void hammer_cursor_downgrade(hammer_cursor_t cursor); +int hammer_cursor_upgrade2(hammer_cursor_t c1, hammer_cursor_t c2); +void hammer_cursor_downgrade2(hammer_cursor_t c1, hammer_cursor_t c2); int hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index); void hammer_lock_ex_ident(struct hammer_lock *lock, const char *ident); int hammer_lock_ex_try(struct hammer_lock *lock); void hammer_lock_sh(struct hammer_lock *lock); int hammer_lock_sh_try(struct hammer_lock *lock); -int hammer_lock_upgrade(struct hammer_lock *lock); -void hammer_lock_downgrade(struct hammer_lock *lock); +int hammer_lock_upgrade(struct hammer_lock *lock, int shcount); +void hammer_lock_downgrade(struct hammer_lock *lock, int shcount); int hammer_lock_status(struct hammer_lock *lock); void hammer_unlock(struct hammer_lock *lock); void hammer_ref(struct hammer_lock *lock); @@ -1276,6 +1278,8 @@ void hammer_blockmap_reserve_complete(hammer_mount_t hmp, void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv); void hammer_blockmap_free(hammer_transaction_t trans, hammer_off_t bmap_off, int bytes); +int hammer_blockmap_dedup(hammer_transaction_t trans, + hammer_off_t bmap_off, int bytes); int hammer_blockmap_finalize(hammer_transaction_t trans, hammer_reserve_t resv, hammer_off_t bmap_off, int bytes); @@ -1410,6 +1414,8 @@ int hammer_ioc_volume_add(hammer_transaction_t trans, hammer_inode_t ip, struct hammer_ioc_volume *ioc); int hammer_ioc_volume_del(hammer_transaction_t trans, hammer_inode_t ip, struct hammer_ioc_volume *ioc); +int hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip, + struct hammer_ioc_dedup *dedup); int hammer_signal_check(hammer_mount_t hmp); diff --git a/sys/vfs/hammer/hammer_blockmap.c b/sys/vfs/hammer/hammer_blockmap.c index b308eab332..35a0ccd1dc 100644 --- a/sys/vfs/hammer/hammer_blockmap.c +++ b/sys/vfs/hammer/hammer_blockmap.c @@ -898,6 +898,113 @@ failed: hammer_rel_buffer(buffer2, 0); } +int +hammer_blockmap_dedup(hammer_transaction_t trans, + hammer_off_t zone_offset, int bytes) +{ + hammer_mount_t hmp; + hammer_volume_t root_volume; + hammer_blockmap_t blockmap; + hammer_blockmap_t freemap; + struct hammer_blockmap_layer1 *layer1; + struct hammer_blockmap_layer2 *layer2; + hammer_buffer_t buffer1 = NULL; + hammer_buffer_t buffer2 = NULL; + hammer_off_t layer1_offset; + hammer_off_t layer2_offset; + int32_t temp; + int error; + int zone; + + if (bytes == 0) + return (0); + hmp = trans->hmp; + + /* + * Alignment + */ + bytes = (bytes + 15) & ~15; + KKASSERT(bytes <= HAMMER_LARGEBLOCK_SIZE); + KKASSERT(((zone_offset ^ (zone_offset + (bytes - 1))) & + ~HAMMER_LARGEBLOCK_MASK64) == 0); + + /* + * Basic zone validation & locking + */ + zone = HAMMER_ZONE_DECODE(zone_offset); + KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES); + root_volume = trans->rootvol; + error = 0; + + blockmap = &hmp->blockmap[zone]; + freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX]; + + /* + * Dive layer 1. 
+ */ + layer1_offset = freemap->phys_offset + + HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset); + layer1 = hammer_bread(hmp, layer1_offset, &error, &buffer1); + if (error) + goto failed; + KKASSERT(layer1->phys_offset && + layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL); + if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) { + hammer_lock_ex(&hmp->blkmap_lock); + if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) + panic("CRC FAILED: LAYER1"); + hammer_unlock(&hmp->blkmap_lock); + } + + /* + * Dive layer 2, each entry represents a large-block. + */ + layer2_offset = layer1->phys_offset + + HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset); + layer2 = hammer_bread(hmp, layer2_offset, &error, &buffer2); + if (error) + goto failed; + if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) { + hammer_lock_ex(&hmp->blkmap_lock); + if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) + panic("CRC FAILED: LAYER2"); + hammer_unlock(&hmp->blkmap_lock); + } + + hammer_lock_ex(&hmp->blkmap_lock); + + hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2)); + + /* + * Free space previously allocated via blockmap_alloc(). + * + * NOTE: bytes_free can be and remain negative due to de-dup ops + * but can never become larger than HAMMER_LARGEBLOCK_SIZE. + */ + KKASSERT(layer2->zone == zone); + temp = layer2->bytes_free - HAMMER_LARGEBLOCK_SIZE * 2; + cpu_ccfence(); /* prevent gcc from optimizing temp out */ + if (temp > layer2->bytes_free) { + error = ERANGE; + goto underflow; + } + layer2->bytes_free -= bytes; + + KKASSERT(layer2->bytes_free <= HAMMER_LARGEBLOCK_SIZE); + + layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE); +underflow: + hammer_modify_buffer_done(buffer2); + hammer_unlock(&hmp->blkmap_lock); + +failed: + if (buffer1) + hammer_rel_buffer(buffer1, 0); + if (buffer2) + hammer_rel_buffer(buffer2, 0); + return (error); +} + /* * Backend function - finalize (offset, bytes) in a zone. * diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c index f6de3fa442..a88f5e1d95 100644 --- a/sys/vfs/hammer/hammer_cursor.c +++ b/sys/vfs/hammer/hammer_cursor.c @@ -220,12 +220,12 @@ hammer_cursor_upgrade(hammer_cursor_t cursor) { int error; - error = hammer_lock_upgrade(&cursor->node->lock); + error = hammer_lock_upgrade(&cursor->node->lock, 1); if (error && cursor->deadlk_node == NULL) { cursor->deadlk_node = cursor->node; hammer_ref_node(cursor->deadlk_node); } else if (error == 0 && cursor->parent) { - error = hammer_lock_upgrade(&cursor->parent->lock); + error = hammer_lock_upgrade(&cursor->parent->lock, 1); if (error && cursor->deadlk_node == NULL) { cursor->deadlk_node = cursor->parent; hammer_ref_node(cursor->deadlk_node); @@ -239,7 +239,7 @@ hammer_cursor_upgrade_node(hammer_cursor_t cursor) { int error; - error = hammer_lock_upgrade(&cursor->node->lock); + error = hammer_lock_upgrade(&cursor->node->lock, 1); if (error && cursor->deadlk_node == NULL) { cursor->deadlk_node = cursor->node; hammer_ref_node(cursor->deadlk_node); @@ -255,14 +255,89 @@ void hammer_cursor_downgrade(hammer_cursor_t cursor) { if (hammer_lock_excl_owned(&cursor->node->lock, curthread)) - hammer_lock_downgrade(&cursor->node->lock); + hammer_lock_downgrade(&cursor->node->lock, 1); if (cursor->parent && hammer_lock_excl_owned(&cursor->parent->lock, curthread)) { - hammer_lock_downgrade(&cursor->parent->lock); + hammer_lock_downgrade(&cursor->parent->lock, 1); } } /* + * Upgrade and downgrade pairs of cursors. 
This is used by the dedup + * code which must deal with two cursors. A special function is needed + * because some of the nodes may be shared between the two cursors, + * resulting in share counts > 1 which will normally cause an upgrade + * to fail. + */ +static __noinline +int +collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node) +{ + int i; + + for (i = 0; i < n; ++i) { + if (array[i] == node) + break; + } + if (i == n) { + array[i] = node; + counts[i] = 1; + ++i; + } else { + ++counts[i]; + } + return(i); +} + +int +hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) +{ + hammer_node_t nodes[4]; + int counts[4]; + int error; + int i; + int n; + + n = collect_node(nodes, counts, 0, cursor1->node); + if (cursor1->parent) + n = collect_node(nodes, counts, n, cursor1->parent); + n = collect_node(nodes, counts, n, cursor2->node); + if (cursor2->parent) + n = collect_node(nodes, counts, n, cursor2->parent); + + error = 0; + for (i = 0; i < n; ++i) { + error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]); + if (error) + break; + } + if (error) { + while (--i >= 0) + hammer_lock_downgrade(&nodes[i]->lock, counts[i]); + } + return (error); +} + +void +hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) +{ + hammer_node_t nodes[4]; + int counts[4]; + int i; + int n; + + n = collect_node(nodes, counts, 0, cursor1->node); + if (cursor1->parent) + n = collect_node(nodes, counts, n, cursor1->parent); + n = collect_node(nodes, counts, n, cursor2->node); + if (cursor2->parent) + n = collect_node(nodes, counts, n, cursor2->parent); + + for (i = 0; i < n; ++i) + hammer_lock_downgrade(&nodes[i]->lock, counts[i]); +} + +/* * Seek the cursor to the specified node and index. * * The caller must ref the node prior to calling this routine and release diff --git a/sys/vfs/hammer/hammer_dedup.c b/sys/vfs/hammer/hammer_dedup.c new file mode 100644 index 0000000000..7c724cdd38 --- /dev/null +++ b/sys/vfs/hammer/hammer_dedup.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2010 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Ilya Dryomov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "hammer.h" + +static __inline int validate_zone(hammer_off_t data_offset); + +int +hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip, + struct hammer_ioc_dedup *dedup) +{ + struct hammer_cursor cursor1, cursor2; + int error; + + /* + * Cursor1, return an error -> candidate goes to pass2 list + */ + error = hammer_init_cursor(trans, &cursor1, NULL, NULL); + if (error) + goto done_cursor; + cursor1.key_beg = dedup->elm1; + cursor1.flags |= HAMMER_CURSOR_BACKEND; + + error = hammer_btree_lookup(&cursor1); + if (error) + goto done_cursor; + error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF | + HAMMER_CURSOR_GET_DATA); + if (error) + goto done_cursor; + + /* + * Cursor2, return an error -> candidate goes to pass2 list + */ + error = hammer_init_cursor(trans, &cursor2, NULL, NULL); + if (error) + goto done_cursors; + cursor2.key_beg = dedup->elm2; + cursor2.flags |= HAMMER_CURSOR_BACKEND; + + error = hammer_btree_lookup(&cursor2); + if (error) + goto done_cursors; + error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF | + HAMMER_CURSOR_GET_DATA); + if (error) + goto done_cursors; + + /* + * Zone validation. We can't de-dup any of the other zones + * (BTREE or META) or bad things will happen. + * + * Return with error = 0, but set an INVALID_ZONE flag. + */ + error = validate_zone(cursor1.leaf->data_offset) + + validate_zone(cursor2.leaf->data_offset); + if (error) { + dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE; + error = 0; + goto done_cursors; + } + + /* + * Comparison checks + * + * If zones don't match or data_len fields aren't the same + * we consider it to be a comparison failure. + * + * Return with error = 0, but set a CMP_FAILURE flag. 
+ */ + if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) != + (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) { + dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE; + goto done_cursors; + } + if (cursor1.leaf->data_len != cursor2.leaf->data_len) { + dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE; + goto done_cursors; + } + + /* byte-by-byte comparison to be sure */ + if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) { + dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE; + goto done_cursors; + } + + /* + * Upgrade both cursors together to an exclusive lock + * + * Return an error -> candidate goes to pass2 list + */ + hammer_sync_lock_sh(trans); + error = hammer_cursor_upgrade2(&cursor1, &cursor2); + if (error) { + hammer_sync_unlock(trans); + goto done_cursors; + } + + error = hammer_blockmap_dedup(cursor1.trans, + cursor1.leaf->data_offset, cursor1.leaf->data_len); + if (error) { + if (error == ERANGE) { + /* + * Return with error = 0, but set an UNDERFLOW flag + */ + dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW; + error = 0; + goto downgrade_cursors; + } else { + /* + * Return an error -> block goes to pass2 list + */ + goto downgrade_cursors; + } + } + + /* + * The cursor2's cache must be invalidated before calling + * hammer_blockmap_free(), otherwise it will not be able to + * invalidate the underlying data buffer. + */ + hammer_cursor_invalidate_cache(&cursor2); + hammer_blockmap_free(cursor2.trans, + cursor2.leaf->data_offset, cursor2.leaf->data_len); + + hammer_modify_node(cursor2.trans, cursor2.node, + &cursor2.leaf->data_offset, sizeof(hammer_off_t)); + cursor2.leaf->data_offset = cursor1.leaf->data_offset; + hammer_modify_node_done(cursor2.node); + +downgrade_cursors: + hammer_cursor_downgrade2(&cursor1, &cursor2); + hammer_sync_unlock(trans); +done_cursors: + hammer_done_cursor(&cursor2); +done_cursor: + hammer_done_cursor(&cursor1); + return (error); +} + +static __inline int +validate_zone(hammer_off_t data_offset) +{ + switch(data_offset & HAMMER_OFF_ZONE_MASK) { + case HAMMER_ZONE_LARGE_DATA: + case HAMMER_ZONE_SMALL_DATA: + return (0); + default: + return (1); + } +} diff --git a/sys/vfs/hammer/hammer_ioctl.c b/sys/vfs/hammer/hammer_ioctl.c index f8fe994f90..7d8653d1b5 100644 --- a/sys/vfs/hammer/hammer_ioctl.c +++ b/sys/vfs/hammer/hammer_ioctl.c @@ -58,6 +58,8 @@ static int hammer_ioc_get_config(hammer_transaction_t trans, hammer_inode_t ip, struct hammer_ioc_config *snap); static int hammer_ioc_set_config(hammer_transaction_t trans, hammer_inode_t ip, struct hammer_ioc_config *snap); +static int hammer_ioc_get_data(hammer_transaction_t trans, hammer_inode_t ip, + struct hammer_ioc_data *data); int hammer_ioctl(hammer_inode_t ip, u_long com, caddr_t data, int fflag, @@ -210,6 +212,18 @@ hammer_ioctl(hammer_inode_t ip, u_long com, caddr_t data, int fflag, &trans, ip, (struct hammer_ioc_config *)data); } break; + case HAMMERIOC_DEDUP: + if (error == 0) { + error = hammer_ioc_dedup( + &trans, ip, (struct hammer_ioc_dedup *)data); + } + break; + case HAMMERIOC_GET_DATA: + if (error == 0) { + error = hammer_ioc_get_data( + &trans, ip, (struct hammer_ioc_data *)data); + } + break; default: error = EOPNOTSUPP; break; @@ -991,3 +1005,38 @@ again: hammer_done_cursor(&cursor); return(0); } + +static +int +hammer_ioc_get_data(hammer_transaction_t trans, hammer_inode_t ip, + struct hammer_ioc_data *data) +{ + struct hammer_cursor cursor; + int bytes; + int error; + + /* XXX cached inode ? 
*/ + error = hammer_init_cursor(trans, &cursor, NULL, NULL); + if (error) + goto failed; + + cursor.key_beg = data->elm; + cursor.flags |= HAMMER_CURSOR_BACKEND; + + error = hammer_btree_lookup(&cursor); + if (error == 0) { + error = hammer_btree_extract(&cursor, HAMMER_CURSOR_GET_LEAF | + HAMMER_CURSOR_GET_DATA); + if (error == 0) { + data->leaf = *cursor.leaf; + bytes = cursor.leaf->data_len; + if (bytes > data->size) + bytes = data->size; + error = copyout(cursor.data, data->ubuf, bytes); + } + } + +failed: + hammer_done_cursor(&cursor); + return (error); +} diff --git a/sys/vfs/hammer/hammer_ioctl.h b/sys/vfs/hammer/hammer_ioctl.h index f0778ed48f..2c8e979ceb 100644 --- a/sys/vfs/hammer/hammer_ioctl.h +++ b/sys/vfs/hammer/hammer_ioctl.h @@ -438,6 +438,30 @@ struct hammer_ioc_config { }; /* + * HAMMERIOC_DEDUP + */ +struct hammer_ioc_dedup { + struct hammer_ioc_head head; + struct hammer_base_elm elm1; + struct hammer_base_elm elm2; /* candidate for dedup */ +}; + +#define HAMMER_IOC_DEDUP_CMP_FAILURE 0x0001 /* verification failed */ +#define HAMMER_IOC_DEDUP_UNDERFLOW 0x0002 /* bigblock underflow */ +#define HAMMER_IOC_DEDUP_INVALID_ZONE 0x0004 /* we can't dedup all zones */ + +/* + * HAMMERIOC_GET_DATA + */ +struct hammer_ioc_data { + struct hammer_ioc_head head; + struct hammer_base_elm elm; /* btree key to lookup */ + struct hammer_btree_leaf_elm leaf; + void *ubuf; /* user buffer */ + int size; /* max size */ +}; + +/* * Ioctl cmd ids */ #define HAMMERIOC_PRUNE _IOWR('h',1,struct hammer_ioc_prune) @@ -463,6 +487,8 @@ struct hammer_ioc_config { #define HAMMERIOC_GET_CONFIG _IOWR('h',21,struct hammer_ioc_config) #define HAMMERIOC_SET_CONFIG _IOWR('h',22,struct hammer_ioc_config) #define HAMMERIOC_DEL_VOLUME _IOWR('h',24,struct hammer_ioc_volume) +#define HAMMERIOC_DEDUP _IOWR('h',25,struct hammer_ioc_dedup) +#define HAMMERIOC_GET_DATA _IOWR('h',26,struct hammer_ioc_data) #endif diff --git a/sys/vfs/hammer/hammer_subs.c b/sys/vfs/hammer/hammer_subs.c index 39f1a59c9c..f38be11f09 100644 --- a/sys/vfs/hammer/hammer_subs.c +++ b/sys/vfs/hammer/hammer_subs.c @@ -206,7 +206,7 @@ hammer_lock_sh_try(struct hammer_lock *lock) * by someone else, this function will panic. */ int -hammer_lock_upgrade(struct hammer_lock *lock) +hammer_lock_upgrade(struct hammer_lock *lock, int shcount) { thread_t td = curthread; u_int lv; @@ -216,7 +216,7 @@ hammer_lock_upgrade(struct hammer_lock *lock) for (;;) { lv = lock->lockval; - if ((lv & ~HAMMER_LOCKF_WANTED) == 1) { + if ((lv & ~HAMMER_LOCKF_WANTED) == shcount) { nlv = lv | HAMMER_LOCKF_EXCLUSIVE; if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { lock->lowner = td; @@ -245,14 +245,14 @@ hammer_lock_upgrade(struct hammer_lock *lock) * Downgrade an exclusively held lock to a shared lock. */ void -hammer_lock_downgrade(struct hammer_lock *lock) +hammer_lock_downgrade(struct hammer_lock *lock, int shcount) { thread_t td __debugvar = curthread; u_int lv; u_int nlv; KKASSERT((lock->lockval & ~HAMMER_LOCKF_WANTED) == - (HAMMER_LOCKF_EXCLUSIVE | 1)); + (HAMMER_LOCKF_EXCLUSIVE | shcount)); KKASSERT(lock->lowner == td); /* -- 2.11.4.GIT
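
A quick illustration for reviewers of how the new ioctl is driven from
userland, distilled from deduplicate() in cmd_dedup.c above. This is a
sketch, not part of the patch: the include paths and the try_dedup()
wrapper are assumptions. In the utility itself the two leaf elements come
from a HAMMERIOC_MIRROR_READ scan (see scan_pfs()) and the return codes
feed the Pass2/SHA-upgrade state machine in process_btree_elm().

	/* Sketch only: assumes the HAMMER headers are reachable here. */
	#include <strings.h>
	#include <sys/ioctl.h>
	#include <vfs/hammer/hammer_ioctl.h>

	/*
	 * Ask the kernel to dedup two data blocks identified by their
	 * B-Tree keys.  fd must be open on the PFS; p points at the block
	 * to keep, q at the candidate whose data_offset is rewritten to
	 * point at p's data on success.
	 */
	static int
	try_dedup(int fd, hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
	{
		struct hammer_ioc_dedup dedup;

		bzero(&dedup, sizeof(dedup));
		dedup.elm1 = p->base;
		dedup.elm2 = q->base;

		if (ioctl(fd, HAMMERIOC_DEDUP, &dedup) < 0)
			return (-1);	/* technical failure: retry in pass2 */
		if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE)
			return (1);	/* contents differ: CRC/SHA collision */
		if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE)
			return (2);	/* not SMALL_DATA/LARGE_DATA: give up */
		if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW)
			return (3);	/* bigblock underflow: re-anchor on q */
		return (0);		/* q now shares p's data block */
	}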