/* sbin/hammer/cmd_dedup.c */

/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Ilya Dryomov <idryomov@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"
#include <libutil.h>
#include <crypto/sha2/sha2.h>

#define DEDUP_BUF (64 * 1024)
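
/*
 * DEDUP_BUF sizes both the mirror-read scan buffer and the
 * HAMMERIOC_GET_DATA buffer below; 64KB should be enough to hold the
 * data payload of any single HAMMER data record.
 */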

/* Sorted list of block CRCs - light version for dedup-simulate */
struct sim_dedup_entry_rb_tree;
RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
				RB_INITIALIZER(&sim_dedup_tree);
RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
		rb_sim_dedup_entry_compare, hammer_crc_t);

struct sim_dedup_entry {
	hammer_crc_t	crc;
	u_int64_t	ref_blks;	/* number of blocks referenced */
	u_int64_t	ref_size;	/* size of data referenced */
	RB_ENTRY(sim_dedup_entry) rb_entry;
};

struct dedup_entry {
	struct hammer_btree_leaf_elm leaf;
	union {
		struct {
			u_int64_t ref_blks;
			u_int64_t ref_size;
		} de;
		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
	} u;
	u_int8_t flags;
	RB_ENTRY(dedup_entry) rb_entry;
};

#define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001
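
/*
 * A dedup_entry starts out keyed by CRC alone, with u.de carrying the
 * counters.  The first CRC collision flips it to "fictitious": the flag
 * above is set and u.fict_root becomes the root of a SHA-256 subtree
 * that takes over the accounting (see process_btree_elm()).
 */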

struct sha_dedup_entry {
	struct hammer_btree_leaf_elm	leaf;
	u_int64_t			ref_blks;
	u_int64_t			ref_size;
	u_int8_t			sha_hash[SHA256_DIGEST_LENGTH];
	RB_ENTRY(sha_dedup_entry)	fict_entry;
};

/* Sorted list of HAMMER B-Tree keys */
struct dedup_entry_rb_tree;
struct sha_dedup_entry_rb_tree;

RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
				RB_INITIALIZER(&dedup_tree);
RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
		rb_dedup_entry_compare, hammer_crc_t);

RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
		rb_sha_dedup_entry_compare);

/*
 * Pass2 list - contains entries that were not dedup'ed because ioctl failed
 */
STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
				STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);

struct pass2_dedup_entry {
	struct hammer_btree_leaf_elm	leaf;
	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
};

#define DEDUP_PASS2	0x0001	/* process_btree_elm() mode */

static int SigInfoFlag;
static int SigAlrmFlag;
static int64_t DedupDataReads;
static int64_t DedupCurrentRecords;
static int64_t DedupTotalRecords;
static u_int32_t DedupCrcStart;
static u_int32_t DedupCrcEnd;
static u_int64_t MemoryUse;

/* PFS global ids - we deal with just one PFS at a run */
int glob_fd;
struct hammer_ioc_pseudofs_rw glob_pfs;

/*
 * Global accounting variables
 *
 * Last three don't have to be 64-bit, just to be safe..
 */
u_int64_t dedup_alloc_size = 0;
u_int64_t dedup_ref_size = 0;
u_int64_t dedup_skipped_size = 0;
u_int64_t dedup_crc_failures = 0;
u_int64_t dedup_sha_failures = 0;
u_int64_t dedup_underflows = 0;
u_int64_t dedup_successes_count = 0;
u_int64_t dedup_successes_bytes = 0;

static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
				struct sim_dedup_entry *sim_de2);
static int rb_dedup_entry_compare(struct dedup_entry *de1,
				struct dedup_entry *de2);
static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
				struct sha_dedup_entry *sha_de2);
typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
static void dump_simulated_dedup(void);
static void dump_real_dedup(void);
static void dedup_usage(int code);
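
/*
 * Note: the RB_PROTOTYPE2()/RB_GENERATE2() variants also emit a keyed
 * RB_LOOKUP(), which lets the CRC trees below be searched with a bare
 * hammer_crc_t.  The SHA subtree uses the plain variant and therefore
 * needs RB_FIND() with a temporary entry.
 */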

RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
		rb_sha_dedup_entry_compare);

static int
rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
			struct sim_dedup_entry *sim_de2)
{
	if (sim_de1->crc < sim_de2->crc)
		return (-1);
	if (sim_de1->crc > sim_de2->crc)
		return (1);

	return (0);
}

static int
rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
{
	if (de1->leaf.data_crc < de2->leaf.data_crc)
		return (-1);
	if (de1->leaf.data_crc > de2->leaf.data_crc)
		return (1);

	return (0);
}

static int
rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
			struct sha_dedup_entry *sha_de2)
{
	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
	int i;
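
	/*
	 * The digest is compared one machine word at a time, so the
	 * resulting order is endian-dependent rather than lexicographic;
	 * any consistent total order is fine for an RB tree.
	 */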
	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
		if (h1[i] < h2[i])
			return (-1);
		if (h1[i] > h2[i])
			return (1);
	}

	return (0);
}

/*
 * dedup-simulate <filesystem>
 */
void
hammer_cmd_dedup_simulate(char **av, int ac)
{
	struct sim_dedup_entry *sim_de;

	if (ac != 1)
		dedup_usage(1);

	glob_fd = getpfs(&glob_pfs, av[0]);

	/*
	 * Collection passes (memory limited)
	 */
	printf("Dedup-simulate running\n");
	do {
		DedupCrcStart = DedupCrcEnd;
		DedupCrcEnd = 0;
		MemoryUse = 0;

		if (VerboseOpt) {
			printf("b-tree pass crc-range %08x-max\n",
				DedupCrcStart);
			fflush(stdout);
		}
		scan_pfs(av[0], collect_btree_elm, "simu-pass");

		if (VerboseOpt >= 2)
			dump_simulated_dedup();

		/*
		 * Calculate simulated dedup ratio and get rid of the tree
		 */
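		/*
		 * A CRC referenced by N blocks totalling S bytes is
		 * assumed to collapse to a single copy of S/N bytes.
		 * For example, ten identical 64KB blocks count as 640KB
		 * referenced vs 64KB allocated, a simulated ratio of 10.
		 */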
		while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
			assert(sim_de->ref_blks != 0);
			dedup_ref_size += sim_de->ref_size;
			dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;

			RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
			free(sim_de);
		}
		if (DedupCrcEnd && VerboseOpt == 0)
			printf(".");
	} while (DedupCrcEnd);

	printf("Dedup-simulate %s succeeded\n", av[0]);
	relpfs(glob_fd, &glob_pfs);

	printf("Simulated dedup ratio = %.2f\n",
	       (dedup_alloc_size != 0) ?
			(double)dedup_ref_size / dedup_alloc_size : 0);
}

/*
 * dedup <filesystem>
 */
void
hammer_cmd_dedup(char **av, int ac)
{
	struct dedup_entry *de;
	struct sha_dedup_entry *sha_de;
	struct pass2_dedup_entry *pass2_de;
	char *tmp;
	char buf[8];
	int needfree = 0;

	if (TimeoutOpt > 0)
		alarm(TimeoutOpt);

	if (ac != 1)
		dedup_usage(1);

	STAILQ_INIT(&pass2_dedup_queue);

	glob_fd = getpfs(&glob_pfs, av[0]);

	/*
	 * A cycle file is _required_ for resuming dedup after the timeout
	 * specified with -t has expired.  If no -c option, then place a
	 * .dedup.cycle file either in the PFS snapshots directory or in
	 * the default snapshots directory.
	 */
	if (!CyclePath) {
		if (glob_pfs.ondisk->snapshots[0] != '/')
			asprintf(&tmp, "%s/%s/.dedup.cycle",
			    SNAPSHOTS_BASE, av[0]);
		else
			asprintf(&tmp, "%s/.dedup.cycle",
			    glob_pfs.ondisk->snapshots);
		CyclePath = tmp;
		needfree = 1;
	}
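
	/*
	 * (With the stock SNAPSHOTS_BASE this typically comes out as
	 * something like /var/hammer/<filesystem>/.dedup.cycle.)
	 */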

	/*
	 * Pre-pass to cache the btree
	 */
	scan_pfs(av[0], count_btree_elm, "pre-pass ");
	DedupTotalRecords = DedupCurrentRecords;

	/*
	 * Collection passes (memory limited)
	 */
	printf("Dedup running\n");
	do {
		DedupCrcStart = DedupCrcEnd;
		DedupCrcEnd = 0;
		MemoryUse = 0;

		if (VerboseOpt) {
			printf("b-tree pass crc-range %08x-max\n",
				DedupCrcStart);
			fflush(stdout);
		}
		scan_pfs(av[0], process_btree_elm, "main-pass");
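
		/*
		 * Retry elements that failed with a transient (locking)
		 * error during the main pass.  DEDUP_PASS2 tells
		 * process_btree_elm() not to re-queue them on failure.
		 */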
		while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
			if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
				dedup_skipped_size -= pass2_de->leaf.data_len;

			STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
			free(pass2_de);
		}
		assert(STAILQ_EMPTY(&pass2_dedup_queue));

		if (VerboseOpt >= 2)
			dump_real_dedup();

		/*
		 * Calculate dedup ratio and get rid of the trees
		 */
		while ((de = RB_ROOT(&dedup_tree)) != NULL) {
			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
				while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
					assert(sha_de->ref_blks != 0);
					dedup_ref_size += sha_de->ref_size;
					dedup_alloc_size +=
						sha_de->ref_size / sha_de->ref_blks;

					RB_REMOVE(sha_dedup_entry_rb_tree,
							&de->u.fict_root, sha_de);
					free(sha_de);
				}
				assert(RB_EMPTY(&de->u.fict_root));
			} else {
				assert(de->u.de.ref_blks != 0);
				dedup_ref_size += de->u.de.ref_size;
				dedup_alloc_size +=
					de->u.de.ref_size / de->u.de.ref_blks;
			}

			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
			free(de);
		}
		assert(RB_EMPTY(&dedup_tree));

		if (DedupCrcEnd && VerboseOpt == 0)
			printf(".");
	} while (DedupCrcEnd);
352 printf("Dedup %s succeeded\n", av[0]);
353 relpfs(glob_fd, &glob_pfs);
355 humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
356 printf("Dedup ratio = %.2f (in this run)\n"
357 " %8s referenced\n",
358 ((dedup_alloc_size != 0) ?
359 (double)dedup_ref_size / dedup_alloc_size : 0),
362 humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
363 printf(" %8s allocated\n", buf);
364 humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
365 printf(" %8s skipped\n", buf);
366 printf(" %8jd CRC collisions\n"
367 " %8jd SHA collisions\n"
368 " %8jd bigblock underflows\n"
369 " %8jd new dedup records\n"
370 " %8jd new dedup bytes\n",
371 (intmax_t)dedup_crc_failures,
372 (intmax_t)dedup_sha_failures,
373 (intmax_t)dedup_underflows,
374 (intmax_t)dedup_successes_count,
375 (intmax_t)dedup_successes_bytes
378 /* Once completed remove cycle file */
379 hammer_reset_cycle();
381 /* We don't want to mess up with other directives */
382 if (needfree) {
383 free(tmp);
384 CyclePath = NULL;

static int
count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
{
	return (1);
}

static int
collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
{
	struct sim_dedup_entry *sim_de;

	/*
	 * If we are using too much memory we have to clean some out, which
	 * will cause the run to use multiple passes.  Be careful of integer
	 * overflows!
	 */
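	/*
	 * DedupCrcEnd == 0 stands for "no upper bound": the halving below
	 * shrinks the remaining CRC window (e.g. 00000000-max becomes
	 * 00000000-7fffffff), and evicted entries are re-collected by a
	 * later pass starting at the new DedupCrcEnd.
	 */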
	if (MemoryUse > MemoryLimit) {
		DedupCrcEnd = DedupCrcStart +
			(u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
		if (VerboseOpt) {
			printf("memory limit crc-range %08x-%08x\n",
				DedupCrcStart, DedupCrcEnd);
			fflush(stdout);
		}

		for (;;) {
			sim_de = RB_MAX(sim_dedup_entry_rb_tree,
					&sim_dedup_tree);
			if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
				break;
			RB_REMOVE(sim_dedup_entry_rb_tree,
					&sim_dedup_tree, sim_de);
			MemoryUse -= sizeof(*sim_de);
			free(sim_de);
		}
	}

	/*
	 * Collect statistics based on the CRC only, do not try to read
	 * any data blocks or run SHA hashes.
	 */
	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
			scan_leaf->data_crc);

	if (sim_de == NULL) {
		sim_de = calloc(1, sizeof(*sim_de));
		sim_de->crc = scan_leaf->data_crc;
		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
		MemoryUse += sizeof(*sim_de);
	}

	sim_de->ref_blks += 1;
	sim_de->ref_size += scan_leaf->data_len;
	return (1);
}
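
/*
 * Cheap structural pre-check for a candidate pair: both blocks must
 * live in the same blockmap zone and have the same data length.  The
 * actual byte-by-byte comparison is done by the HAMMERIOC_DEDUP ioctl,
 * which reports a mismatch as HAMMER_IOC_DEDUP_CMP_FAILURE.
 */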
static __inline int
validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
{
	if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
		return (1);
	}
	if (p->data_len != q->data_len) {
		return (1);
	}

	return (0);
}

#define DEDUP_TECH_FAILURE	1
#define DEDUP_CMP_FAILURE	2
#define DEDUP_INVALID_ZONE	3
#define DEDUP_UNDERFLOW		4
#define DEDUP_VERS_FAILURE	5

static __inline int
deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
{
	struct hammer_ioc_dedup dedup;

	bzero(&dedup, sizeof(dedup));

	/*
	 * If data_offset fields are the same there is no need to run ioctl,
	 * candidate is already dedup'ed.
	 */
	if (p->data_offset == q->data_offset) {
		return (0);
	}

	dedup.elm1 = p->base;
	dedup.elm2 = q->base;
	RunningIoctl = 1;
	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
		if (errno == EOPNOTSUPP) {
			/* must be at least version 5 */
			return (DEDUP_VERS_FAILURE);
		}
		/* Technical failure - locking or w/e */
		return (DEDUP_TECH_FAILURE);
	}
	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
		return (DEDUP_CMP_FAILURE);
	}
	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
		return (DEDUP_INVALID_ZONE);
	}
	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
		return (DEDUP_UNDERFLOW);
	}
	RunningIoctl = 0;
	++dedup_successes_count;
	dedup_successes_bytes += p->data_len;
	return (0);
}

static int
process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
{
	struct dedup_entry *de;
	struct sha_dedup_entry *sha_de, temp;
	struct pass2_dedup_entry *pass2_de;
	int error;

	/*
	 * If we are using too much memory we have to clean some out, which
	 * will cause the run to use multiple passes.  Be careful of integer
	 * overflows!
	 */
	while (MemoryUse > MemoryLimit) {
		DedupCrcEnd = DedupCrcStart +
			(u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
		if (VerboseOpt) {
			printf("memory limit crc-range %08x-%08x\n",
				DedupCrcStart, DedupCrcEnd);
			fflush(stdout);
		}

		for (;;) {
			de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
			if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
				break;
			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
				while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
				       NULL) {
					RB_REMOVE(sha_dedup_entry_rb_tree,
						&de->u.fict_root, sha_de);
					MemoryUse -= sizeof(*sha_de);
					free(sha_de);
				}
			}
			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
			MemoryUse -= sizeof(*de);
			free(de);
		}
	}

	/*
	 * Collect statistics based on the CRC.  Colliding CRCs usually
	 * cause a SHA sub-tree to be created under the de.
	 *
	 * Trivial case if de not found.
	 */
	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
	if (de == NULL) {
		de = calloc(1, sizeof(*de));
		de->leaf = *scan_leaf;
		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
		MemoryUse += sizeof(*de);
		goto upgrade_stats;
	}

	/*
	 * Found entry in CRC tree
	 */
	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
		/*
		 * Optimize the case where a CRC failure results in multiple
		 * SHA entries.  If we unconditionally issue a data-read a
		 * degenerate situation where a colliding CRC's second SHA
		 * entry contains the lion's share of the deduplication
		 * candidates will result in excessive data block reads.
		 *
		 * Deal with the degenerate case by looking for a matching
		 * data_offset/data_len in the SHA elements we already have
		 * before reading the data block and generating a new SHA.
		 */
		RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
			if (sha_de->leaf.data_offset ==
						scan_leaf->data_offset &&
			    sha_de->leaf.data_len == scan_leaf->data_len) {
				memcpy(temp.sha_hash, sha_de->sha_hash,
					SHA256_DIGEST_LENGTH);
				break;
			}
		}

		/*
		 * Entry in CRC tree is fictitious, so we already had problems
		 * with this CRC.  Upgrade (compute SHA) the candidate and
		 * dive into SHA subtree.  If upgrade fails insert the
		 * candidate into Pass2 list (it will be processed later).
		 */
		if (sha_de == NULL) {
			if (upgrade_chksum(scan_leaf, temp.sha_hash))
				goto pass2_insert;

			sha_de = RB_FIND(sha_dedup_entry_rb_tree,
					&de->u.fict_root, &temp);
		}

		/*
		 * Nothing in SHA subtree so far, so this is a new
		 * 'dataset'.  Insert new entry into SHA subtree.
		 */
		if (sha_de == NULL) {
			sha_de = calloc(1, sizeof(*sha_de));
			sha_de->leaf = *scan_leaf;
			memcpy(sha_de->sha_hash, temp.sha_hash,
				SHA256_DIGEST_LENGTH);
			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
					sha_de);
			MemoryUse += sizeof(*sha_de);
			goto upgrade_stats_sha;
		}

		/*
		 * Found entry in SHA subtree, it means we have a potential
		 * dedup pair.  Validate it (zones have to match and the
		 * data_len fields have to be the same too).  If validation
		 * fails, treat it as a SHA collision (jump to sha256_failure).
		 */
		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
			goto sha256_failure;

		/*
		 * We have a valid dedup pair (SHA match, validated).
		 *
		 * In case of technical failure (dedup pair was good, but
		 * ioctl failed anyways) insert the candidate into Pass2 list
		 * (we will try to dedup it after we are done with the rest of
		 * the tree).
		 *
		 * If ioctl fails because either of blocks is in the non-dedup
		 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
		 * bother with the candidate and terminate early.
		 *
		 * If ioctl fails because of bigblock underflow replace the
		 * leaf node that found dedup entry represents with scan_leaf.
		 */
		error = deduplicate(&sha_de->leaf, scan_leaf);
		switch (error) {
		case 0:
			goto upgrade_stats_sha;
		case DEDUP_TECH_FAILURE:
			goto pass2_insert;
		case DEDUP_CMP_FAILURE:
			goto sha256_failure;
		case DEDUP_INVALID_ZONE:
			goto terminate_early;
		case DEDUP_UNDERFLOW:
			++dedup_underflows;
			sha_de->leaf = *scan_leaf;
			memcpy(sha_de->sha_hash, temp.sha_hash,
				SHA256_DIGEST_LENGTH);
			goto upgrade_stats_sha;
		case DEDUP_VERS_FAILURE:
			fprintf(stderr,
				"HAMMER filesystem must be at least "
				"version 5 to dedup\n");
			exit(1);
		default:
			fprintf(stderr, "Unknown error\n");
			goto terminate_early;
		}

		/*
		 * Ooh la la.. SHA-256 collision.  Terminate early, there's
		 * nothing we can do here.
		 */
sha256_failure:
		++dedup_sha_failures;
		goto terminate_early;
	} else {
		/*
		 * Candidate CRC is good for now (we found an entry in CRC
		 * tree and it's not fictitious).  This means we have a
		 * potential dedup pair.
		 */
		if (validate_dedup_pair(&de->leaf, scan_leaf))
			goto crc_failure;

		/*
		 * We have a valid dedup pair (CRC match, validated)
		 */
		error = deduplicate(&de->leaf, scan_leaf);
		switch (error) {
		case 0:
			goto upgrade_stats;
		case DEDUP_TECH_FAILURE:
			goto pass2_insert;
		case DEDUP_CMP_FAILURE:
			goto crc_failure;
		case DEDUP_INVALID_ZONE:
			goto terminate_early;
		case DEDUP_UNDERFLOW:
			++dedup_underflows;
			de->leaf = *scan_leaf;
			goto upgrade_stats;
		case DEDUP_VERS_FAILURE:
			fprintf(stderr,
				"HAMMER filesystem must be at least "
				"version 5 to dedup\n");
			exit(1);
		default:
			fprintf(stderr, "Unknown error\n");
			goto terminate_early;
		}

crc_failure:
		/*
		 * We got a CRC collision - either the ioctl failed because
		 * of the comparison failure or validation of the potential
		 * dedup pair went bad.  In all cases insert both blocks
		 * into SHA subtree (this requires checksum upgrade) and mark
		 * entry that corresponds to this CRC in the CRC tree
		 * fictitious, so that all further operations with this CRC
		 * go through SHA subtree.
		 */
		++dedup_crc_failures;

		/*
		 * Insert block that was represented by now fictitious dedup
		 * entry (create a new SHA entry and preserve stats of the
		 * old CRC one).  If checksum upgrade fails insert the
		 * candidate into Pass2 list and return - keep both trees
		 * unmodified.
		 */
		sha_de = calloc(1, sizeof(*sha_de));
		sha_de->leaf = de->leaf;
		sha_de->ref_blks = de->u.de.ref_blks;
		sha_de->ref_size = de->u.de.ref_size;
		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
			free(sha_de);
			goto pass2_insert;
		}
		MemoryUse += sizeof(*sha_de);

		RB_INIT(&de->u.fict_root);
		/*
		 * Here we can insert without prior checking because the tree
		 * is empty at this point
		 */
		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);

		/*
		 * Mark entry in CRC tree fictitious
		 */
		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;

		/*
		 * Upgrade checksum of the candidate and insert it into
		 * SHA subtree.  If upgrade fails insert the candidate into
		 * Pass2 list.
		 */
		if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
			goto pass2_insert;
		}
		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
				&temp);
		if (sha_de != NULL) {
			/*
			 * There is an entry with this SHA already, but the
			 * only RB-tree element at this point is that entry
			 * we just added.  We know for sure these blocks are
			 * different (this is the crc_failure branch) so
			 * treat it as a SHA collision.
			 */
			goto sha256_failure;
		}

		sha_de = calloc(1, sizeof(*sha_de));
		sha_de->leaf = *scan_leaf;
		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
		MemoryUse += sizeof(*sha_de);
		goto upgrade_stats_sha;
	}

upgrade_stats:
	de->u.de.ref_blks += 1;
	de->u.de.ref_size += scan_leaf->data_len;
	return (1);

upgrade_stats_sha:
	sha_de->ref_blks += 1;
	sha_de->ref_size += scan_leaf->data_len;
	return (1);

pass2_insert:
	/*
	 * If in pass2 mode don't insert anything, fall through to
	 * terminate_early
	 */
	if ((flags & DEDUP_PASS2) == 0) {
		pass2_de = calloc(1, sizeof(*pass2_de));
		pass2_de->leaf = *scan_leaf;
		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
		dedup_skipped_size += scan_leaf->data_len;
		return (1);
	}

terminate_early:
	/*
	 * Early termination path.  Fixup stats.
	 */
	dedup_alloc_size += scan_leaf->data_len;
	dedup_ref_size += scan_leaf->data_len;
	return (0);
}

static int
upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
{
	struct hammer_ioc_data data;
	char *buf = malloc(DEDUP_BUF);
	SHA256_CTX ctx;
	int error;

	bzero(&data, sizeof(data));
	data.elm = leaf->base;
	data.ubuf = buf;
	data.size = DEDUP_BUF;

	error = 0;
	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
		error = 1;
		goto done;
	}
	DedupDataReads += leaf->data_len;

	if (data.leaf.data_len != leaf->data_len) {
		error = 1;
		goto done;
	}
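
	/*
	 * Only B-Tree data records are hashed (anything else would leave
	 * sha_hash untouched); in practice scan_pfs() filters out non-data
	 * records before its callback ever gets here.
	 */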
	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
		SHA256_Init(&ctx);
		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
		SHA256_Final(sha_hash, &ctx);
	}

done:
	free(buf);
	return (error);
}

static void
sigAlrm(int signo __unused)
{
	SigAlrmFlag = 1;
}

static void
sigInfo(int signo __unused)
{
	SigInfoFlag = 1;
}

static void
scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
{
	struct hammer_ioc_mirror_rw mirror;
	hammer_ioc_mrecord_any_t mrec;
	struct hammer_btree_leaf_elm elm;
	char *buf = malloc(DEDUP_BUF);
	char buf1[8];
	char buf2[8];
	int offset, bytes;

	SigInfoFlag = 0;
	DedupDataReads = 0;
	DedupCurrentRecords = 0;
	signal(SIGINFO, sigInfo);
	signal(SIGALRM, sigAlrm);

	/*
	 * Deduplication happens per element so hammer(8) is in full
	 * control of the ioctl()s to actually perform it.  SIGALRM
	 * needs to be handled within hammer(8) but a checkpoint
	 * is needed for resuming.  Use cycle file for that.
	 *
	 * Try to obtain the previous obj_id from the cycle file and
	 * if not available just start from the beginning.
	 */
	bzero(&mirror, sizeof(mirror));
	hammer_key_beg_init(&mirror.key_beg);
	hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);

	if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
		if (VerboseOpt)
			fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
				id, (uintmax_t)mirror.key_beg.obj_id);
	}

	hammer_key_end_init(&mirror.key_end);

	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
	mirror.ubuf = buf;
	mirror.size = DEDUP_BUF;
	mirror.pfs_id = glob_pfs.pfs_id;
	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;

	if (VerboseOpt && DedupCrcStart == 0) {
		printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
			id, filesystem,
			(uintmax_t)mirror.key_beg.obj_id,
			mirror.key_beg.localization,
			(uintmax_t)mirror.key_end.obj_id,
			mirror.key_end.localization);
		printf("%s %s: pfs_id %d\n",
			id, filesystem, glob_pfs.pfs_id);
	}
	fflush(stdout);
	fflush(stderr);

	do {
		mirror.count = 0;
		mirror.pfs_id = glob_pfs.pfs_id;
		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
			fprintf(stderr, "Mirror-read %s failed: %s\n",
				filesystem, strerror(errno));
			exit(1);
		}
		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
			fprintf(stderr, "Mirror-read %s fatal error %d\n",
				filesystem, mirror.head.error);
			exit(1);
		}
		if (mirror.count) {
			offset = 0;
			while (offset < mirror.count) {
				mrec = (void *)((char *)buf + offset);
				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
				if (offset + bytes > mirror.count) {
					fprintf(stderr, "Misaligned record\n");
					exit(1);
				}
				assert((mrec->head.type &
				       HAMMER_MRECF_TYPE_MASK) ==
				       HAMMER_MREC_TYPE_REC);
				offset += bytes;
				elm = mrec->rec.leaf;
				if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
					continue;
				if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
					continue;
				++DedupCurrentRecords;
				if (DedupCrcStart != DedupCrcEnd) {
					if (elm.data_crc < DedupCrcStart)
						continue;
					if (DedupCrcEnd &&
					    elm.data_crc >= DedupCrcEnd) {
						continue;
					}
				}
				func(&elm, 0);
			}
		}
		mirror.key_beg = mirror.key_cur;
		if (DidInterrupt || SigAlrmFlag) {
			if (VerboseOpt)
				fprintf(stderr, "%s\n",
					(DidInterrupt ? "Interrupted" : "Timeout"));
			hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
			if (VerboseOpt)
				fprintf(stderr, "Cyclefile %s updated for "
					"continuation\n", CyclePath);
			exit(1);
		}
		if (SigInfoFlag) {
			if (DedupTotalRecords) {
				humanize_unsigned(buf1, sizeof(buf1),
						  DedupDataReads,
						  "B", 1024);
				humanize_unsigned(buf2, sizeof(buf2),
						  dedup_successes_bytes,
						  "B", 1024);
				fprintf(stderr, "%s count %7jd/%jd "
						"(%02d.%02d%%) "
						"ioread %s newdedup %s\n",
					id,
					(intmax_t)DedupCurrentRecords,
					(intmax_t)DedupTotalRecords,
					(int)(DedupCurrentRecords * 100 /
						DedupTotalRecords),
					(int)(DedupCurrentRecords * 10000 /
						DedupTotalRecords % 100),
					buf1, buf2);
			} else {
				fprintf(stderr, "%s count %-7jd\n",
					id,
					(intmax_t)DedupCurrentRecords);
			}
			SigInfoFlag = 0;
		}
	} while (mirror.count != 0);

	signal(SIGINFO, SIG_IGN);
	signal(SIGALRM, SIG_IGN);

	free(buf);
}

static void
dump_simulated_dedup(void)
{
	struct sim_dedup_entry *sim_de;

	printf("=== Dumping simulated dedup entries:\n");
	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
		printf("\tcrc=%08x cnt=%ju size=%ju\n",
			sim_de->crc,
			(uintmax_t)sim_de->ref_blks,
			(uintmax_t)sim_de->ref_size);
	}
	printf("end of dump ===\n");
}

static void
dump_real_dedup(void)
{
	struct dedup_entry *de;
	struct sha_dedup_entry *sha_de;
	int i;

	printf("=== Dumping dedup entries:\n");
	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);

			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
				       "\t\tsha=",
				       sha_de->leaf.data_crc,
				       (uintmax_t)sha_de->ref_blks,
				       (uintmax_t)sha_de->ref_size);
				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
					printf("%02x", sha_de->sha_hash[i]);
				printf("\n");
			}
		} else {
			printf("\tcrc=%08x cnt=%ju size=%ju\n",
				de->leaf.data_crc,
				(uintmax_t)de->u.de.ref_blks,
				(uintmax_t)de->u.de.ref_size);
		}
	}
	printf("end of dump ===\n");
}

static void
dedup_usage(int code)
{
	fprintf(stderr,
		"hammer dedup-simulate <filesystem>\n"
		"hammer dedup <filesystem>\n");
	exit(code);
}
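
/*
 * Example usage (assuming a HAMMER PFS mounted at /home):
 *
 *	hammer dedup-simulate /home	- report the achievable ratio only
 *	hammer -t 3600 dedup /home	- dedup for at most an hour; a later
 *					  run resumes via the cycle file
 */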