2 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Ilya Dryomov <idryomov@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 #include <crypto/sha2/sha2.h>
39 #define DEDUP_BUF (64 * 1024)
41 /* Sorted list of block CRCs - light version for dedup-simulate */
42 struct sim_dedup_entry_rb_tree
;
43 RB_HEAD(sim_dedup_entry_rb_tree
, sim_dedup_entry
) sim_dedup_tree
=
44 RB_INITIALIZER(&sim_dedup_tree
);
45 RB_PROTOTYPE2(sim_dedup_entry_rb_tree
, sim_dedup_entry
, rb_entry
,
46 rb_sim_dedup_entry_compare
, hammer_crc_t
);
48 struct sim_dedup_entry
{
50 u_int64_t ref_blks
; /* number of blocks referenced */
51 u_int64_t ref_size
; /* size of data referenced */
52 RB_ENTRY(sim_dedup_entry
) rb_entry
;
56 struct hammer_btree_leaf_elm leaf
;
62 RB_HEAD(sha_dedup_entry_rb_tree
, sha_dedup_entry
) fict_root
;
65 RB_ENTRY(dedup_entry
) rb_entry
;
68 #define HAMMER_DEDUP_ENTRY_FICTITIOUS 0x0001
70 struct sha_dedup_entry
{
71 struct hammer_btree_leaf_elm leaf
;
74 u_int8_t sha_hash
[SHA256_DIGEST_LENGTH
];
75 RB_ENTRY(sha_dedup_entry
) fict_entry
;
78 /* Sorted list of HAMMER B-Tree keys */
79 struct dedup_entry_rb_tree
;
80 struct sha_dedup_entry_rb_tree
;
82 RB_HEAD(dedup_entry_rb_tree
, dedup_entry
) dedup_tree
=
83 RB_INITIALIZER(&dedup_tree
);
84 RB_PROTOTYPE2(dedup_entry_rb_tree
, dedup_entry
, rb_entry
,
85 rb_dedup_entry_compare
, hammer_crc_t
);
87 RB_PROTOTYPE(sha_dedup_entry_rb_tree
, sha_dedup_entry
, fict_entry
,
88 rb_sha_dedup_entry_compare
);
91 * Pass2 list - contains entries that were not dedup'ed because ioctl failed
93 STAILQ_HEAD(, pass2_dedup_entry
) pass2_dedup_queue
=
94 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue
);
96 struct pass2_dedup_entry
{
97 struct hammer_btree_leaf_elm leaf
;
98 STAILQ_ENTRY(pass2_dedup_entry
) sq_entry
;
101 #define DEDUP_PASS2 0x0001 /* process_btree_elm() mode */
103 static int SigInfoFlag
;
104 static int SigAlrmFlag
;
105 static int64_t DedupDataReads
;
106 static int64_t DedupCurrentRecords
;
107 static int64_t DedupTotalRecords
;
108 static u_int32_t DedupCrcStart
;
109 static u_int32_t DedupCrcEnd
;
110 static u_int64_t MemoryUse
;
112 /* PFS global ids - we deal with just one PFS at a run */
114 struct hammer_ioc_pseudofs_rw glob_pfs
;
117 * Global accounting variables
119 * Last three don't have to be 64-bit, just to be safe..
121 u_int64_t dedup_alloc_size
= 0;
122 u_int64_t dedup_ref_size
= 0;
123 u_int64_t dedup_skipped_size
= 0;
124 u_int64_t dedup_crc_failures
= 0;
125 u_int64_t dedup_sha_failures
= 0;
126 u_int64_t dedup_underflows
= 0;
127 u_int64_t dedup_successes_count
= 0;
128 u_int64_t dedup_successes_bytes
= 0;
130 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry
*sim_de1
,
131 struct sim_dedup_entry
*sim_de2
);
132 static int rb_dedup_entry_compare(struct dedup_entry
*de1
,
133 struct dedup_entry
*de2
);
134 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry
*sha_de1
,
135 struct sha_dedup_entry
*sha_de2
);
136 typedef int (*scan_pfs_cb_t
)(hammer_btree_leaf_elm_t scan_leaf
, int flags
);
137 static void scan_pfs(char *filesystem
, scan_pfs_cb_t func
, const char *id
);
138 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf
, int flags
);
139 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf
, int flags
);
140 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf
, int flags
);
141 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf
, u_int8_t
*sha_hash
);
142 static void dump_simulated_dedup(void);
143 static void dump_real_dedup(void);
144 static void dedup_usage(int code
);
146 RB_GENERATE2(sim_dedup_entry_rb_tree
, sim_dedup_entry
, rb_entry
,
147 rb_sim_dedup_entry_compare
, hammer_crc_t
, crc
);
148 RB_GENERATE2(dedup_entry_rb_tree
, dedup_entry
, rb_entry
,
149 rb_dedup_entry_compare
, hammer_crc_t
, leaf
.data_crc
);
150 RB_GENERATE(sha_dedup_entry_rb_tree
, sha_dedup_entry
, fict_entry
,
151 rb_sha_dedup_entry_compare
);
154 rb_sim_dedup_entry_compare(struct sim_dedup_entry
*sim_de1
,
155 struct sim_dedup_entry
*sim_de2
)
157 if (sim_de1
->crc
< sim_de2
->crc
)
159 if (sim_de1
->crc
> sim_de2
->crc
)
166 rb_dedup_entry_compare(struct dedup_entry
*de1
, struct dedup_entry
*de2
)
168 if (de1
->leaf
.data_crc
< de2
->leaf
.data_crc
)
170 if (de1
->leaf
.data_crc
> de2
->leaf
.data_crc
)
177 rb_sha_dedup_entry_compare(struct sha_dedup_entry
*sha_de1
,
178 struct sha_dedup_entry
*sha_de2
)
180 unsigned long *h1
= (unsigned long *)&sha_de1
->sha_hash
;
181 unsigned long *h2
= (unsigned long *)&sha_de2
->sha_hash
;
184 for (i
= 0; i
< SHA256_DIGEST_LENGTH
/ (int)sizeof(unsigned long); ++i
) {
195 * dedup-simulate <filesystem>
198 hammer_cmd_dedup_simulate(char **av
, int ac
)
200 struct sim_dedup_entry
*sim_de
;
205 glob_fd
= getpfs(&glob_pfs
, av
[0]);
208 * Collection passes (memory limited)
210 printf("Dedup-simulate running\n");
212 DedupCrcStart
= DedupCrcEnd
;
217 printf("b-tree pass crc-range %08x-max\n",
221 scan_pfs(av
[0], collect_btree_elm
, "simu-pass");
224 dump_simulated_dedup();
227 * Calculate simulated dedup ratio and get rid of the tree
229 while ((sim_de
= RB_ROOT(&sim_dedup_tree
)) != NULL
) {
230 assert(sim_de
->ref_blks
!= 0);
231 dedup_ref_size
+= sim_de
->ref_size
;
232 dedup_alloc_size
+= sim_de
->ref_size
/ sim_de
->ref_blks
;
234 RB_REMOVE(sim_dedup_entry_rb_tree
, &sim_dedup_tree
, sim_de
);
237 if (DedupCrcEnd
&& VerboseOpt
== 0)
239 } while (DedupCrcEnd
);
241 printf("Dedup-simulate %s succeeded\n", av
[0]);
242 relpfs(glob_fd
, &glob_pfs
);
244 printf("Simulated dedup ratio = %.2f\n",
245 (dedup_alloc_size
!= 0) ?
246 (double)dedup_ref_size
/ dedup_alloc_size
: 0);
253 hammer_cmd_dedup(char **av
, int ac
)
255 struct dedup_entry
*de
;
256 struct sha_dedup_entry
*sha_de
;
257 struct pass2_dedup_entry
*pass2_de
;
268 STAILQ_INIT(&pass2_dedup_queue
);
270 glob_fd
= getpfs(&glob_pfs
, av
[0]);
273 * A cycle file is _required_ for resuming dedup after the timeout
274 * specified with -t has expired. If no -c option, then place a
275 * .dedup.cycle file either in the PFS snapshots directory or in
276 * the default snapshots directory.
279 if (glob_pfs
.ondisk
->snapshots
[0] != '/')
280 asprintf(&tmp
, "%s/%s/.dedup.cycle",
281 SNAPSHOTS_BASE
, av
[0]);
283 asprintf(&tmp
, "%s/.dedup.cycle",
284 glob_pfs
.ondisk
->snapshots
);
290 * Pre-pass to cache the btree
292 scan_pfs(av
[0], count_btree_elm
, "pre-pass ");
293 DedupTotalRecords
= DedupCurrentRecords
;
296 * Collection passes (memory limited)
298 printf("Dedup running\n");
300 DedupCrcStart
= DedupCrcEnd
;
305 printf("b-tree pass crc-range %08x-max\n",
309 scan_pfs(av
[0], process_btree_elm
, "main-pass");
311 while ((pass2_de
= STAILQ_FIRST(&pass2_dedup_queue
)) != NULL
) {
312 if (process_btree_elm(&pass2_de
->leaf
, DEDUP_PASS2
))
313 dedup_skipped_size
-= pass2_de
->leaf
.data_len
;
315 STAILQ_REMOVE_HEAD(&pass2_dedup_queue
, sq_entry
);
318 assert(STAILQ_EMPTY(&pass2_dedup_queue
));
324 * Calculate dedup ratio and get rid of the trees
326 while ((de
= RB_ROOT(&dedup_tree
)) != NULL
) {
327 if (de
->flags
& HAMMER_DEDUP_ENTRY_FICTITIOUS
) {
328 while ((sha_de
= RB_ROOT(&de
->u
.fict_root
)) != NULL
) {
329 assert(sha_de
->ref_blks
!= 0);
330 dedup_ref_size
+= sha_de
->ref_size
;
331 dedup_alloc_size
+= sha_de
->ref_size
/ sha_de
->ref_blks
;
333 RB_REMOVE(sha_dedup_entry_rb_tree
,
334 &de
->u
.fict_root
, sha_de
);
337 assert(RB_EMPTY(&de
->u
.fict_root
));
339 assert(de
->u
.de
.ref_blks
!= 0);
340 dedup_ref_size
+= de
->u
.de
.ref_size
;
341 dedup_alloc_size
+= de
->u
.de
.ref_size
/ de
->u
.de
.ref_blks
;
344 RB_REMOVE(dedup_entry_rb_tree
, &dedup_tree
, de
);
347 assert(RB_EMPTY(&dedup_tree
));
348 if (DedupCrcEnd
&& VerboseOpt
== 0)
350 } while (DedupCrcEnd
);
352 printf("Dedup %s succeeded\n", av
[0]);
353 relpfs(glob_fd
, &glob_pfs
);
355 humanize_unsigned(buf
, sizeof(buf
), dedup_ref_size
, "B", 1024);
356 printf("Dedup ratio = %.2f (in this run)\n"
358 ((dedup_alloc_size
!= 0) ?
359 (double)dedup_ref_size
/ dedup_alloc_size
: 0),
362 humanize_unsigned(buf
, sizeof(buf
), dedup_alloc_size
, "B", 1024);
363 printf(" %8s allocated\n", buf
);
364 humanize_unsigned(buf
, sizeof(buf
), dedup_skipped_size
, "B", 1024);
365 printf(" %8s skipped\n", buf
);
366 printf(" %8jd CRC collisions\n"
367 " %8jd SHA collisions\n"
368 " %8jd bigblock underflows\n"
369 " %8jd new dedup records\n"
370 " %8jd new dedup bytes\n",
371 (intmax_t)dedup_crc_failures
,
372 (intmax_t)dedup_sha_failures
,
373 (intmax_t)dedup_underflows
,
374 (intmax_t)dedup_successes_count
,
375 (intmax_t)dedup_successes_bytes
378 /* Once completed remove cycle file */
379 hammer_reset_cycle();
381 /* We don't want to mess up with other directives */
389 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused
, int flags __unused
)
395 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf
, int flags __unused
)
397 struct sim_dedup_entry
*sim_de
;
400 * If we are using too much memory we have to clean some out, which
401 * will cause the run to use multiple passes. Be careful of integer
404 if (MemoryUse
> MemoryLimit
) {
405 DedupCrcEnd
= DedupCrcStart
+
406 (u_int32_t
)(DedupCrcEnd
- DedupCrcStart
- 1) / 2;
408 printf("memory limit crc-range %08x-%08x\n",
409 DedupCrcStart
, DedupCrcEnd
);
413 sim_de
= RB_MAX(sim_dedup_entry_rb_tree
,
415 if (sim_de
== NULL
|| sim_de
->crc
< DedupCrcEnd
)
417 RB_REMOVE(sim_dedup_entry_rb_tree
,
418 &sim_dedup_tree
, sim_de
);
419 MemoryUse
-= sizeof(*sim_de
);
425 * Collect statistics based on the CRC only, do not try to read
426 * any data blocks or run SHA hashes.
428 sim_de
= RB_LOOKUP(sim_dedup_entry_rb_tree
, &sim_dedup_tree
,
429 scan_leaf
->data_crc
);
431 if (sim_de
== NULL
) {
432 sim_de
= calloc(sizeof(*sim_de
), 1);
433 sim_de
->crc
= scan_leaf
->data_crc
;
434 RB_INSERT(sim_dedup_entry_rb_tree
, &sim_dedup_tree
, sim_de
);
435 MemoryUse
+= sizeof(*sim_de
);
438 sim_de
->ref_blks
+= 1;
439 sim_de
->ref_size
+= scan_leaf
->data_len
;
444 validate_dedup_pair(hammer_btree_leaf_elm_t p
, hammer_btree_leaf_elm_t q
)
446 if ((p
->data_offset
& HAMMER_OFF_ZONE_MASK
) !=
447 (q
->data_offset
& HAMMER_OFF_ZONE_MASK
)) {
450 if (p
->data_len
!= q
->data_len
) {
457 #define DEDUP_TECH_FAILURE 1
458 #define DEDUP_CMP_FAILURE 2
459 #define DEDUP_INVALID_ZONE 3
460 #define DEDUP_UNDERFLOW 4
461 #define DEDUP_VERS_FAILURE 5
464 deduplicate(hammer_btree_leaf_elm_t p
, hammer_btree_leaf_elm_t q
)
466 struct hammer_ioc_dedup dedup
;
468 bzero(&dedup
, sizeof(dedup
));
471 * If data_offset fields are the same there is no need to run ioctl,
472 * candidate is already dedup'ed.
474 if (p
->data_offset
== q
->data_offset
) {
478 dedup
.elm1
= p
->base
;
479 dedup
.elm2
= q
->base
;
481 if (ioctl(glob_fd
, HAMMERIOC_DEDUP
, &dedup
) < 0) {
482 if (errno
== EOPNOTSUPP
) {
483 /* must be at least version 5 */
484 return (DEDUP_VERS_FAILURE
);
486 /* Technical failure - locking or w/e */
487 return (DEDUP_TECH_FAILURE
);
489 if (dedup
.head
.flags
& HAMMER_IOC_DEDUP_CMP_FAILURE
) {
490 return (DEDUP_CMP_FAILURE
);
492 if (dedup
.head
.flags
& HAMMER_IOC_DEDUP_INVALID_ZONE
) {
493 return (DEDUP_INVALID_ZONE
);
495 if (dedup
.head
.flags
& HAMMER_IOC_DEDUP_UNDERFLOW
) {
496 return (DEDUP_UNDERFLOW
);
499 ++dedup_successes_count
;
500 dedup_successes_bytes
+= p
->data_len
;
505 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf
, int flags
)
507 struct dedup_entry
*de
;
508 struct sha_dedup_entry
*sha_de
, temp
;
509 struct pass2_dedup_entry
*pass2_de
;
513 * If we are using too much memory we have to clean some out, which
514 * will cause the run to use multiple passes. Be careful of integer
517 while (MemoryUse
> MemoryLimit
) {
518 DedupCrcEnd
= DedupCrcStart
+
519 (u_int32_t
)(DedupCrcEnd
- DedupCrcStart
- 1) / 2;
521 printf("memory limit crc-range %08x-%08x\n",
522 DedupCrcStart
, DedupCrcEnd
);
527 de
= RB_MAX(dedup_entry_rb_tree
, &dedup_tree
);
528 if (de
== NULL
|| de
->leaf
.data_crc
< DedupCrcEnd
)
530 if (de
->flags
& HAMMER_DEDUP_ENTRY_FICTITIOUS
) {
531 while ((sha_de
= RB_ROOT(&de
->u
.fict_root
)) !=
533 RB_REMOVE(sha_dedup_entry_rb_tree
,
534 &de
->u
.fict_root
, sha_de
);
535 MemoryUse
-= sizeof(*sha_de
);
539 RB_REMOVE(dedup_entry_rb_tree
, &dedup_tree
, de
);
540 MemoryUse
-= sizeof(*de
);
546 * Collect statistics based on the CRC. Colliding CRCs usually
547 * cause a SHA sub-tree to be created under the de.
549 * Trivial case if de not found.
551 de
= RB_LOOKUP(dedup_entry_rb_tree
, &dedup_tree
, scan_leaf
->data_crc
);
553 de
= calloc(sizeof(*de
), 1);
554 de
->leaf
= *scan_leaf
;
555 RB_INSERT(dedup_entry_rb_tree
, &dedup_tree
, de
);
556 MemoryUse
+= sizeof(*de
);
561 * Found entry in CRC tree
563 if (de
->flags
& HAMMER_DEDUP_ENTRY_FICTITIOUS
) {
565 * Optimize the case where a CRC failure results in multiple
566 * SHA entries. If we unconditionally issue a data-read a
567 * degenerate situation where a colliding CRC's second SHA
568 * entry contains the lion's share of the deduplication
569 * candidates will result in excessive data block reads.
571 * Deal with the degenerate case by looking for a matching
572 * data_offset/data_len in the SHA elements we already have
573 * before reading the data block and generating a new SHA.
575 RB_FOREACH(sha_de
, sha_dedup_entry_rb_tree
, &de
->u
.fict_root
) {
576 if (sha_de
->leaf
.data_offset
==
577 scan_leaf
->data_offset
&&
578 sha_de
->leaf
.data_len
== scan_leaf
->data_len
) {
579 memcpy(temp
.sha_hash
, sha_de
->sha_hash
,
580 SHA256_DIGEST_LENGTH
);
586 * Entry in CRC tree is fictitious, so we already had problems
587 * with this CRC. Upgrade (compute SHA) the candidate and
588 * dive into SHA subtree. If upgrade fails insert the candidate
589 * into Pass2 list (it will be processed later).
591 if (sha_de
== NULL
) {
592 if (upgrade_chksum(scan_leaf
, temp
.sha_hash
))
595 sha_de
= RB_FIND(sha_dedup_entry_rb_tree
,
596 &de
->u
.fict_root
, &temp
);
600 * Nothing in SHA subtree so far, so this is a new
601 * 'dataset'. Insert new entry into SHA subtree.
603 if (sha_de
== NULL
) {
604 sha_de
= calloc(sizeof(*sha_de
), 1);
605 sha_de
->leaf
= *scan_leaf
;
606 memcpy(sha_de
->sha_hash
, temp
.sha_hash
,
607 SHA256_DIGEST_LENGTH
);
608 RB_INSERT(sha_dedup_entry_rb_tree
, &de
->u
.fict_root
,
610 MemoryUse
+= sizeof(*sha_de
);
611 goto upgrade_stats_sha
;
615 * Found entry in SHA subtree, it means we have a potential
616 * dedup pair. Validate it (zones have to match and data_len
617 * field have to be the same too. If validation fails, treat
618 * it as a SHA collision (jump to sha256_failure).
620 if (validate_dedup_pair(&sha_de
->leaf
, scan_leaf
))
624 * We have a valid dedup pair (SHA match, validated).
626 * In case of technical failure (dedup pair was good, but
627 * ioctl failed anyways) insert the candidate into Pass2 list
628 * (we will try to dedup it after we are done with the rest of
631 * If ioctl fails because either of blocks is in the non-dedup
632 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
633 * bother with the candidate and terminate early.
635 * If ioctl fails because of bigblock underflow replace the
636 * leaf node that found dedup entry represents with scan_leaf.
638 error
= deduplicate(&sha_de
->leaf
, scan_leaf
);
641 goto upgrade_stats_sha
;
642 case DEDUP_TECH_FAILURE
:
644 case DEDUP_CMP_FAILURE
:
646 case DEDUP_INVALID_ZONE
:
647 goto terminate_early
;
648 case DEDUP_UNDERFLOW
:
650 sha_de
->leaf
= *scan_leaf
;
651 memcpy(sha_de
->sha_hash
, temp
.sha_hash
,
652 SHA256_DIGEST_LENGTH
);
653 goto upgrade_stats_sha
;
654 case DEDUP_VERS_FAILURE
:
656 "HAMMER filesystem must be at least "
657 "version 5 to dedup\n");
660 fprintf(stderr
, "Unknown error\n");
661 goto terminate_early
;
665 * Ooh la la.. SHA-256 collision. Terminate early, there's
666 * nothing we can do here.
669 ++dedup_sha_failures
;
670 goto terminate_early
;
673 * Candidate CRC is good for now (we found an entry in CRC
674 * tree and it's not fictitious). This means we have a
675 * potential dedup pair.
677 if (validate_dedup_pair(&de
->leaf
, scan_leaf
))
681 * We have a valid dedup pair (CRC match, validated)
683 error
= deduplicate(&de
->leaf
, scan_leaf
);
687 case DEDUP_TECH_FAILURE
:
689 case DEDUP_CMP_FAILURE
:
691 case DEDUP_INVALID_ZONE
:
692 goto terminate_early
;
693 case DEDUP_UNDERFLOW
:
695 de
->leaf
= *scan_leaf
;
697 case DEDUP_VERS_FAILURE
:
699 "HAMMER filesystem must be at least "
700 "version 5 to dedup\n");
703 fprintf(stderr
, "Unknown error\n");
704 goto terminate_early
;
709 * We got a CRC collision - either ioctl failed because of
710 * the comparison failure or validation of the potential
711 * dedup pair went bad. In all cases insert both blocks
712 * into SHA subtree (this requires checksum upgrade) and mark
713 * entry that corresponds to this CRC in the CRC tree
714 * fictitious, so that all futher operations with this CRC go
715 * through SHA subtree.
717 ++dedup_crc_failures
;
720 * Insert block that was represented by now fictitious dedup
721 * entry (create a new SHA entry and preserve stats of the
722 * old CRC one). If checksum upgrade fails insert the
723 * candidate into Pass2 list and return - keep both trees
726 sha_de
= calloc(sizeof(*sha_de
), 1);
727 sha_de
->leaf
= de
->leaf
;
728 sha_de
->ref_blks
= de
->u
.de
.ref_blks
;
729 sha_de
->ref_size
= de
->u
.de
.ref_size
;
730 if (upgrade_chksum(&sha_de
->leaf
, sha_de
->sha_hash
)) {
734 MemoryUse
+= sizeof(*sha_de
);
736 RB_INIT(&de
->u
.fict_root
);
738 * Here we can insert without prior checking because the tree
739 * is empty at this point
741 RB_INSERT(sha_dedup_entry_rb_tree
, &de
->u
.fict_root
, sha_de
);
744 * Mark entry in CRC tree fictitious
746 de
->flags
|= HAMMER_DEDUP_ENTRY_FICTITIOUS
;
749 * Upgrade checksum of the candidate and insert it into
750 * SHA subtree. If upgrade fails insert the candidate into
753 if (upgrade_chksum(scan_leaf
, temp
.sha_hash
)) {
756 sha_de
= RB_FIND(sha_dedup_entry_rb_tree
, &de
->u
.fict_root
,
759 /* There is an entry with this SHA already, but the only
760 * RB-tree element at this point is that entry we just
761 * added. We know for sure these blocks are different
762 * (this is crc_failure branch) so treat it as SHA
767 sha_de
= calloc(sizeof(*sha_de
), 1);
768 sha_de
->leaf
= *scan_leaf
;
769 memcpy(sha_de
->sha_hash
, temp
.sha_hash
, SHA256_DIGEST_LENGTH
);
770 RB_INSERT(sha_dedup_entry_rb_tree
, &de
->u
.fict_root
, sha_de
);
771 MemoryUse
+= sizeof(*sha_de
);
772 goto upgrade_stats_sha
;
776 de
->u
.de
.ref_blks
+= 1;
777 de
->u
.de
.ref_size
+= scan_leaf
->data_len
;
781 sha_de
->ref_blks
+= 1;
782 sha_de
->ref_size
+= scan_leaf
->data_len
;
787 * If in pass2 mode don't insert anything, fall through to
790 if ((flags
& DEDUP_PASS2
) == 0) {
791 pass2_de
= calloc(sizeof(*pass2_de
), 1);
792 pass2_de
->leaf
= *scan_leaf
;
793 STAILQ_INSERT_TAIL(&pass2_dedup_queue
, pass2_de
, sq_entry
);
794 dedup_skipped_size
+= scan_leaf
->data_len
;
800 * Early termination path. Fixup stats.
802 dedup_alloc_size
+= scan_leaf
->data_len
;
803 dedup_ref_size
+= scan_leaf
->data_len
;
808 upgrade_chksum(hammer_btree_leaf_elm_t leaf
, u_int8_t
*sha_hash
)
810 struct hammer_ioc_data data
;
811 char *buf
= malloc(DEDUP_BUF
);
815 bzero(&data
, sizeof(data
));
816 data
.elm
= leaf
->base
;
818 data
.size
= DEDUP_BUF
;
821 if (ioctl(glob_fd
, HAMMERIOC_GET_DATA
, &data
) < 0) {
822 fprintf(stderr
, "Get-data failed: %s\n", strerror(errno
));
826 DedupDataReads
+= leaf
->data_len
;
828 if (data
.leaf
.data_len
!= leaf
->data_len
) {
833 if (data
.leaf
.base
.btype
== HAMMER_BTREE_TYPE_RECORD
&&
834 data
.leaf
.base
.rec_type
== HAMMER_RECTYPE_DATA
) {
836 SHA256_Update(&ctx
, (void *)buf
, data
.leaf
.data_len
);
837 SHA256_Final(sha_hash
, &ctx
);
846 sigAlrm(int signo __unused
)
852 sigInfo(int signo __unused
)
858 scan_pfs(char *filesystem
, scan_pfs_cb_t func
, const char *id
)
860 struct hammer_ioc_mirror_rw mirror
;
861 hammer_ioc_mrecord_any_t mrec
;
862 struct hammer_btree_leaf_elm elm
;
863 char *buf
= malloc(DEDUP_BUF
);
870 DedupCurrentRecords
= 0;
871 signal(SIGINFO
, sigInfo
);
872 signal(SIGALRM
, sigAlrm
);
875 * Deduplication happens per element so hammer(8) is in full
876 * control of the ioctl()s to actually perform it. SIGALRM
877 * needs to be handled within hammer(8) but a checkpoint
878 * is needed for resuming. Use cycle file for that.
880 * Try to obtain the previous obj_id from the cycle file and
881 * if not available just start from the beginning.
883 bzero(&mirror
, sizeof(mirror
));
884 hammer_key_beg_init(&mirror
.key_beg
);
885 hammer_get_cycle(&mirror
.key_beg
, &mirror
.tid_beg
);
887 if (mirror
.key_beg
.obj_id
!= (int64_t)HAMMER_MIN_OBJID
) {
889 fprintf(stderr
, "%s: mirror-read: Resuming at object %016jx\n",
890 id
, (uintmax_t)mirror
.key_beg
.obj_id
);
893 hammer_key_end_init(&mirror
.key_end
);
895 mirror
.tid_beg
= glob_pfs
.ondisk
->sync_beg_tid
;
896 mirror
.tid_end
= glob_pfs
.ondisk
->sync_end_tid
;
897 mirror
.head
.flags
|= HAMMER_IOC_MIRROR_NODATA
; /* we want only keys */
899 mirror
.size
= DEDUP_BUF
;
900 mirror
.pfs_id
= glob_pfs
.pfs_id
;
901 mirror
.shared_uuid
= glob_pfs
.ondisk
->shared_uuid
;
903 if (VerboseOpt
&& DedupCrcStart
== 0) {
904 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
906 (uintmax_t)mirror
.key_beg
.obj_id
,
907 mirror
.key_beg
.localization
,
908 (uintmax_t)mirror
.key_end
.obj_id
,
909 mirror
.key_end
.localization
);
910 printf("%s %s: pfs_id %d\n",
911 id
, filesystem
, glob_pfs
.pfs_id
);
918 mirror
.pfs_id
= glob_pfs
.pfs_id
;
919 mirror
.shared_uuid
= glob_pfs
.ondisk
->shared_uuid
;
920 if (ioctl(glob_fd
, HAMMERIOC_MIRROR_READ
, &mirror
) < 0) {
921 fprintf(stderr
, "Mirror-read %s failed: %s\n",
922 filesystem
, strerror(errno
));
925 if (mirror
.head
.flags
& HAMMER_IOC_HEAD_ERROR
) {
926 fprintf(stderr
, "Mirror-read %s fatal error %d\n",
927 filesystem
, mirror
.head
.error
);
932 while (offset
< mirror
.count
) {
933 mrec
= (void *)((char *)buf
+ offset
);
934 bytes
= HAMMER_HEAD_DOALIGN(mrec
->head
.rec_size
);
935 if (offset
+ bytes
> mirror
.count
) {
936 fprintf(stderr
, "Misaligned record\n");
939 assert((mrec
->head
.type
&
940 HAMMER_MRECF_TYPE_MASK
) ==
941 HAMMER_MREC_TYPE_REC
);
943 elm
= mrec
->rec
.leaf
;
944 if (elm
.base
.btype
!= HAMMER_BTREE_TYPE_RECORD
)
946 if (elm
.base
.rec_type
!= HAMMER_RECTYPE_DATA
)
948 ++DedupCurrentRecords
;
949 if (DedupCrcStart
!= DedupCrcEnd
) {
950 if (elm
.data_crc
< DedupCrcStart
)
953 elm
.data_crc
>= DedupCrcEnd
) {
960 mirror
.key_beg
= mirror
.key_cur
;
961 if (DidInterrupt
|| SigAlrmFlag
) {
963 fprintf(stderr
, "%s\n",
964 (DidInterrupt
? "Interrupted" : "Timeout"));
965 hammer_set_cycle(&mirror
.key_cur
, mirror
.tid_beg
);
967 fprintf(stderr
, "Cyclefile %s updated for "
968 "continuation\n", CyclePath
);
972 if (DedupTotalRecords
) {
973 humanize_unsigned(buf1
, sizeof(buf1
),
976 humanize_unsigned(buf2
, sizeof(buf2
),
977 dedup_successes_bytes
,
979 fprintf(stderr
, "%s count %7jd/%jd "
981 "ioread %s newddup %s\n",
983 (intmax_t)DedupCurrentRecords
,
984 (intmax_t)DedupTotalRecords
,
985 (int)(DedupCurrentRecords
* 100 /
987 (int)(DedupCurrentRecords
* 10000 /
988 DedupTotalRecords
% 100),
991 fprintf(stderr
, "%s count %-7jd\n",
993 (intmax_t)DedupCurrentRecords
);
997 } while (mirror
.count
!= 0);
999 signal(SIGINFO
, SIG_IGN
);
1000 signal(SIGALRM
, SIG_IGN
);
1006 dump_simulated_dedup(void)
1008 struct sim_dedup_entry
*sim_de
;
1010 printf("=== Dumping simulated dedup entries:\n");
1011 RB_FOREACH(sim_de
, sim_dedup_entry_rb_tree
, &sim_dedup_tree
) {
1012 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1014 (intmax_t)sim_de
->ref_blks
,
1015 (intmax_t)sim_de
->ref_size
);
1017 printf("end of dump ===\n");
1021 dump_real_dedup(void)
1023 struct dedup_entry
*de
;
1024 struct sha_dedup_entry
*sha_de
;
1027 printf("=== Dumping dedup entries:\n");
1028 RB_FOREACH(de
, dedup_entry_rb_tree
, &dedup_tree
) {
1029 if (de
->flags
& HAMMER_DEDUP_ENTRY_FICTITIOUS
) {
1030 printf("\tcrc=%08x fictitious\n", de
->leaf
.data_crc
);
1032 RB_FOREACH(sha_de
, sha_dedup_entry_rb_tree
, &de
->u
.fict_root
) {
1033 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1035 sha_de
->leaf
.data_crc
,
1036 (intmax_t)sha_de
->ref_blks
,
1037 (intmax_t)sha_de
->ref_size
);
1038 for (i
= 0; i
< SHA256_DIGEST_LENGTH
; ++i
)
1039 printf("%02x", sha_de
->sha_hash
[i
]);
1043 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1045 (intmax_t)de
->u
.de
.ref_blks
,
1046 (intmax_t)de
->u
.de
.ref_size
);
1049 printf("end of dump ===\n");
1053 dedup_usage(int code
)
1056 "hammer dedup-simulate <filesystem>\n"
1057 "hammer dedup <filesystem>\n"